diff --git a/agent/agent.go b/agent/agent.go index 6616784e41..82efc2fae5 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -420,7 +420,12 @@ func (a *agent) initSocketServer() { // startBoundaryLogProxyServer starts the boundary log proxy socket server. func (a *agent) startBoundaryLogProxyServer() { - proxy := boundarylogproxy.NewServer(a.logger, a.boundaryLogProxySocketPath) + if a.boundaryLogProxySocketPath == "" { + a.logger.Warn(a.hardCtx, "boundary log proxy socket path not defined; not starting proxy") + return + } + + proxy := boundarylogproxy.NewServer(a.logger, a.boundaryLogProxySocketPath, a.prometheusRegistry) if err := proxy.Start(); err != nil { a.logger.Warn(a.hardCtx, "failed to start boundary log proxy", slog.Error(err)) return diff --git a/agent/boundary_logs_test.go b/agent/boundary_logs_test.go index 5701e0dc43..66a8786a98 100644 --- a/agent/boundary_logs_test.go +++ b/agent/boundary_logs_test.go @@ -10,6 +10,7 @@ import ( "testing" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/require" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/timestamppb" @@ -69,7 +70,7 @@ func TestBoundaryLogs_EndToEnd(t *testing.T) { t.Parallel() socketPath := filepath.Join(testutil.TempDirUnixSocket(t), "boundary.sock") - srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath) + srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath, prometheus.NewRegistry()) err := srv.Start() require.NoError(t, err) diff --git a/agent/boundarylogproxy/codec/boundary.pb.go b/agent/boundarylogproxy/codec/boundary.pb.go index 86b18361b7..38c60734b8 100644 --- a/agent/boundarylogproxy/codec/boundary.pb.go +++ b/agent/boundarylogproxy/codec/boundary.pb.go @@ -34,6 +34,7 @@ type BoundaryMessage struct { // Types that are assignable to Msg: // // *BoundaryMessage_Logs + // *BoundaryMessage_Status Msg isBoundaryMessage_Msg `protobuf_oneof:"msg"` } @@ -83,6 +84,13 @@ func (x 
*BoundaryMessage) GetLogs() *proto.ReportBoundaryLogsRequest { return nil } +func (x *BoundaryMessage) GetStatus() *BoundaryStatus { + if x, ok := x.GetMsg().(*BoundaryMessage_Status); ok { + return x.Status + } + return nil +} + type isBoundaryMessage_Msg interface { isBoundaryMessage_Msg() } @@ -91,8 +99,75 @@ type BoundaryMessage_Logs struct { Logs *proto.ReportBoundaryLogsRequest `protobuf:"bytes,1,opt,name=logs,proto3,oneof"` } +type BoundaryMessage_Status struct { + Status *BoundaryStatus `protobuf:"bytes,2,opt,name=status,proto3,oneof"` +} + func (*BoundaryMessage_Logs) isBoundaryMessage_Msg() {} +func (*BoundaryMessage_Status) isBoundaryMessage_Msg() {} + +// BoundaryStatus carries operational metadata from boundary to the agent. +// The agent records these values as Prometheus metrics. This message is +// never forwarded to coderd. +type BoundaryStatus struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // Logs dropped because boundary's internal channel buffer was full. + DroppedChannelFull int64 `protobuf:"varint,1,opt,name=dropped_channel_full,json=droppedChannelFull,proto3" json:"dropped_channel_full,omitempty"` + // Logs dropped because boundary's batch buffer was full after a + // failed flush attempt. 
+ DroppedBatchFull int64 `protobuf:"varint,2,opt,name=dropped_batch_full,json=droppedBatchFull,proto3" json:"dropped_batch_full,omitempty"` +} + +func (x *BoundaryStatus) Reset() { + *x = BoundaryStatus{} + if protoimpl.UnsafeEnabled { + mi := &file_agent_boundarylogproxy_codec_boundary_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *BoundaryStatus) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*BoundaryStatus) ProtoMessage() {} + +func (x *BoundaryStatus) ProtoReflect() protoreflect.Message { + mi := &file_agent_boundarylogproxy_codec_boundary_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use BoundaryStatus.ProtoReflect.Descriptor instead. +func (*BoundaryStatus) Descriptor() ([]byte, []int) { + return file_agent_boundarylogproxy_codec_boundary_proto_rawDescGZIP(), []int{1} +} + +func (x *BoundaryStatus) GetDroppedChannelFull() int64 { + if x != nil { + return x.DroppedChannelFull + } + return 0 +} + +func (x *BoundaryStatus) GetDroppedBatchFull() int64 { + if x != nil { + return x.DroppedBatchFull + } + return 0 +} + var File_agent_boundarylogproxy_codec_boundary_proto protoreflect.FileDescriptor var file_agent_boundarylogproxy_codec_boundary_proto_rawDesc = []byte{ @@ -102,17 +177,29 @@ var file_agent_boundarylogproxy_codec_boundary_proto_rawDesc = []byte{ 0x6f, 0x64, 0x65, 0x72, 0x2e, 0x62, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x6c, 0x6f, 0x67, 0x70, 0x72, 0x6f, 0x78, 0x79, 0x2e, 0x63, 0x6f, 0x64, 0x65, 0x63, 0x2e, 0x76, 0x31, 0x1a, 0x17, 0x61, 0x67, 0x65, 0x6e, 0x74, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x61, 0x67, 0x65, 0x6e, - 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0x59, 0x0a, 0x0f, 0x42, 0x6f, 0x75, 0x6e, 0x64, - 0x61, 0x72, 0x79, 0x4d, 0x65, 0x73, 
0x73, 0x61, 0x67, 0x65, 0x12, 0x3f, 0x0a, 0x04, 0x6c, 0x6f, - 0x67, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x29, 0x2e, 0x63, 0x6f, 0x64, 0x65, 0x72, - 0x2e, 0x61, 0x67, 0x65, 0x6e, 0x74, 0x2e, 0x76, 0x32, 0x2e, 0x52, 0x65, 0x70, 0x6f, 0x72, 0x74, - 0x42, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x4c, 0x6f, 0x67, 0x73, 0x52, 0x65, 0x71, 0x75, - 0x65, 0x73, 0x74, 0x48, 0x00, 0x52, 0x04, 0x6c, 0x6f, 0x67, 0x73, 0x42, 0x05, 0x0a, 0x03, 0x6d, - 0x73, 0x67, 0x42, 0x38, 0x5a, 0x36, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, - 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x2f, 0x76, 0x32, 0x2f, - 0x61, 0x67, 0x65, 0x6e, 0x74, 0x2f, 0x62, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x6c, 0x6f, - 0x67, 0x70, 0x72, 0x6f, 0x78, 0x79, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x63, 0x62, 0x06, 0x70, 0x72, - 0x6f, 0x74, 0x6f, 0x33, + 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0xa4, 0x01, 0x0a, 0x0f, 0x42, 0x6f, 0x75, 0x6e, + 0x64, 0x61, 0x72, 0x79, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x12, 0x3f, 0x0a, 0x04, 0x6c, + 0x6f, 0x67, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x29, 0x2e, 0x63, 0x6f, 0x64, 0x65, + 0x72, 0x2e, 0x61, 0x67, 0x65, 0x6e, 0x74, 0x2e, 0x76, 0x32, 0x2e, 0x52, 0x65, 0x70, 0x6f, 0x72, + 0x74, 0x42, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x4c, 0x6f, 0x67, 0x73, 0x52, 0x65, 0x71, + 0x75, 0x65, 0x73, 0x74, 0x48, 0x00, 0x52, 0x04, 0x6c, 0x6f, 0x67, 0x73, 0x12, 0x49, 0x0a, 0x06, + 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x2f, 0x2e, 0x63, + 0x6f, 0x64, 0x65, 0x72, 0x2e, 0x62, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x6c, 0x6f, 0x67, + 0x70, 0x72, 0x6f, 0x78, 0x79, 0x2e, 0x63, 0x6f, 0x64, 0x65, 0x63, 0x2e, 0x76, 0x31, 0x2e, 0x42, + 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x48, 0x00, 0x52, + 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x42, 0x05, 0x0a, 0x03, 0x6d, 0x73, 0x67, 0x22, 0x70, + 0x0a, 0x0e, 0x42, 0x6f, 0x75, 0x6e, 0x64, 0x61, 
0x72, 0x79, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, + 0x12, 0x30, 0x0a, 0x14, 0x64, 0x72, 0x6f, 0x70, 0x70, 0x65, 0x64, 0x5f, 0x63, 0x68, 0x61, 0x6e, + 0x6e, 0x65, 0x6c, 0x5f, 0x66, 0x75, 0x6c, 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x03, 0x52, 0x12, + 0x64, 0x72, 0x6f, 0x70, 0x70, 0x65, 0x64, 0x43, 0x68, 0x61, 0x6e, 0x6e, 0x65, 0x6c, 0x46, 0x75, + 0x6c, 0x6c, 0x12, 0x2c, 0x0a, 0x12, 0x64, 0x72, 0x6f, 0x70, 0x70, 0x65, 0x64, 0x5f, 0x62, 0x61, + 0x74, 0x63, 0x68, 0x5f, 0x66, 0x75, 0x6c, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x03, 0x52, 0x10, + 0x64, 0x72, 0x6f, 0x70, 0x70, 0x65, 0x64, 0x42, 0x61, 0x74, 0x63, 0x68, 0x46, 0x75, 0x6c, 0x6c, + 0x42, 0x38, 0x5a, 0x36, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x63, + 0x6f, 0x64, 0x65, 0x72, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x2f, 0x76, 0x32, 0x2f, 0x61, 0x67, + 0x65, 0x6e, 0x74, 0x2f, 0x62, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x6c, 0x6f, 0x67, 0x70, + 0x72, 0x6f, 0x78, 0x79, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x63, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, + 0x6f, 0x33, } var ( @@ -127,18 +214,20 @@ func file_agent_boundarylogproxy_codec_boundary_proto_rawDescGZIP() []byte { return file_agent_boundarylogproxy_codec_boundary_proto_rawDescData } -var file_agent_boundarylogproxy_codec_boundary_proto_msgTypes = make([]protoimpl.MessageInfo, 1) +var file_agent_boundarylogproxy_codec_boundary_proto_msgTypes = make([]protoimpl.MessageInfo, 2) var file_agent_boundarylogproxy_codec_boundary_proto_goTypes = []interface{}{ (*BoundaryMessage)(nil), // 0: coder.boundarylogproxy.codec.v1.BoundaryMessage - (*proto.ReportBoundaryLogsRequest)(nil), // 1: coder.agent.v2.ReportBoundaryLogsRequest + (*BoundaryStatus)(nil), // 1: coder.boundarylogproxy.codec.v1.BoundaryStatus + (*proto.ReportBoundaryLogsRequest)(nil), // 2: coder.agent.v2.ReportBoundaryLogsRequest } var file_agent_boundarylogproxy_codec_boundary_proto_depIdxs = []int32{ - 1, // 0: coder.boundarylogproxy.codec.v1.BoundaryMessage.logs:type_name -> 
coder.agent.v2.ReportBoundaryLogsRequest - 1, // [1:1] is the sub-list for method output_type - 1, // [1:1] is the sub-list for method input_type - 1, // [1:1] is the sub-list for extension type_name - 1, // [1:1] is the sub-list for extension extendee - 0, // [0:1] is the sub-list for field type_name + 2, // 0: coder.boundarylogproxy.codec.v1.BoundaryMessage.logs:type_name -> coder.agent.v2.ReportBoundaryLogsRequest + 1, // 1: coder.boundarylogproxy.codec.v1.BoundaryMessage.status:type_name -> coder.boundarylogproxy.codec.v1.BoundaryStatus + 2, // [2:2] is the sub-list for method output_type + 2, // [2:2] is the sub-list for method input_type + 2, // [2:2] is the sub-list for extension type_name + 2, // [2:2] is the sub-list for extension extendee + 0, // [0:2] is the sub-list for field type_name } func init() { file_agent_boundarylogproxy_codec_boundary_proto_init() } @@ -159,9 +248,22 @@ func file_agent_boundarylogproxy_codec_boundary_proto_init() { return nil } } + file_agent_boundarylogproxy_codec_boundary_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*BoundaryStatus); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } } file_agent_boundarylogproxy_codec_boundary_proto_msgTypes[0].OneofWrappers = []interface{}{ (*BoundaryMessage_Logs)(nil), + (*BoundaryMessage_Status)(nil), } type x struct{} out := protoimpl.TypeBuilder{ @@ -169,7 +271,7 @@ func file_agent_boundarylogproxy_codec_boundary_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_agent_boundarylogproxy_codec_boundary_proto_rawDesc, NumEnums: 0, - NumMessages: 1, + NumMessages: 2, NumExtensions: 0, NumServices: 0, }, diff --git a/agent/boundarylogproxy/codec/boundary.proto b/agent/boundarylogproxy/codec/boundary.proto index ed13160c74..53411785e2 100644 --- a/agent/boundarylogproxy/codec/boundary.proto +++ b/agent/boundarylogproxy/codec/boundary.proto @@ 
-13,5 +13,17 @@ import "agent/proto/agent.proto"; message BoundaryMessage { oneof msg { coder.agent.v2.ReportBoundaryLogsRequest logs = 1; + BoundaryStatus status = 2; } } + +// BoundaryStatus carries operational metadata from boundary to the agent. +// The agent records these values as Prometheus metrics. This message is +// never forwarded to coderd. +message BoundaryStatus { + // Logs dropped because boundary's internal channel buffer was full. + int64 dropped_channel_full = 1; + // Logs dropped because boundary's batch buffer was full after a + // failed flush attempt. + int64 dropped_batch_full = 2; +} diff --git a/agent/boundarylogproxy/metrics.go b/agent/boundarylogproxy/metrics.go new file mode 100644 index 0000000000..6ba2fb188c --- /dev/null +++ b/agent/boundarylogproxy/metrics.go @@ -0,0 +1,77 @@ +package boundarylogproxy + +import "github.com/prometheus/client_golang/prometheus" + +// Metrics tracks observability for the boundary -> agent -> coderd audit log +// pipeline. +// +// Audit logs from boundary workspaces pass through several async buffers +// before reaching coderd, and any stage can silently drop data. These +// metrics make that loss visible so operators/devs can: +// +// - Bubble up data loss: a non-zero drop rate means audit logs are being +// lost, which may have auditing implications. +// - Identify the bottleneck: the reason label pinpoints where drops +// occur: boundary's internal buffers, the agent's channel, or the +// RPC to coderd. +// - Tune buffer sizes: sustained "buffer_full" drops mean the agent's +// channel is too small for the workload ("boundary_*" reasons cover +// boundary's own buffers). With batches_forwarded_total you can +// compute a batch drop rate: drops / (drops + forwards). +// - Detect batch forwarding issues: "forward_failed" drops increase when +// the agent cannot reach coderd.
+// +// Drops are captured at two stages: +// - Agent-side: the agent's channel buffer overflows (reason +// "buffer_full") or the RPC forward to coderd fails (reason +// "forward_failed"). +// - Boundary-reported: boundary self-reports drops via BoundaryStatus +// messages (reasons "boundary_channel_full", "boundary_batch_full"). +// These arrive on the next successful flush from boundary. +// +// There are circumstances where metrics could be lost e.g., agent restarts, +// boundary crashes, or the agent shuts down when the DRPC connection is down. +type Metrics struct { + batchesDropped *prometheus.CounterVec + logsDropped *prometheus.CounterVec + batchesForwarded prometheus.Counter +} + +func newMetrics(registerer prometheus.Registerer) *Metrics { + batchesDropped := prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "agent", + Subsystem: "boundary_log_proxy", + Name: "batches_dropped_total", + Help: "Total number of boundary log batches dropped before reaching coderd. " + + "Reason: buffer_full = the agent's internal buffer is full, meaning boundary is producing logs faster than the agent can forward them to coderd; " + + "forward_failed = the agent failed to send the batch to coderd, potentially because coderd is unreachable or the connection was interrupted.", + }, []string{"reason"}) + registerer.MustRegister(batchesDropped) + + logsDropped := prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "agent", + Subsystem: "boundary_log_proxy", + Name: "logs_dropped_total", + Help: "Total number of individual boundary log entries dropped before reaching coderd. 
" + + "Reason: buffer_full = the agent's internal buffer is full; " + + "forward_failed = the agent failed to send the batch to coderd; " + + "boundary_channel_full = boundary's internal send channel overflowed, meaning boundary is generating logs faster than it can batch and send them; " + + "boundary_batch_full = boundary's outgoing batch buffer overflowed after a failed flush, meaning boundary could not write to the agent's socket.", + }, []string{"reason"}) + registerer.MustRegister(logsDropped) + + batchesForwarded := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", + Subsystem: "boundary_log_proxy", + Name: "batches_forwarded_total", + Help: "Total number of boundary log batches successfully forwarded to coderd. " + + "Compare with batches_dropped_total to compute a drop rate.", + }) + registerer.MustRegister(batchesForwarded) + + return &Metrics{ + batchesDropped: batchesDropped, + logsDropped: logsDropped, + batchesForwarded: batchesForwarded, + } +} diff --git a/agent/boundarylogproxy/proxy.go b/agent/boundarylogproxy/proxy.go index d368a80bc8..9a0ef8c14d 100644 --- a/agent/boundarylogproxy/proxy.go +++ b/agent/boundarylogproxy/proxy.go @@ -11,6 +11,7 @@ import ( "path/filepath" "sync" + "github.com/prometheus/client_golang/prometheus" "golang.org/x/xerrors" "google.golang.org/protobuf/proto" @@ -26,6 +27,13 @@ const ( logBufferSize = 100 ) +const ( + droppedReasonBoundaryChannelFull = "boundary_channel_full" + droppedReasonBoundaryBatchFull = "boundary_batch_full" + droppedReasonBufferFull = "buffer_full" + droppedReasonForwardFailed = "forward_failed" +) + // DefaultSocketPath returns the default path for the boundary audit log socket. 
func DefaultSocketPath() string { return filepath.Join(os.TempDir(), "boundary-audit.sock") @@ -43,6 +51,7 @@ type Reporter interface { type Server struct { logger slog.Logger socketPath string + metrics *Metrics listener net.Listener cancel context.CancelFunc @@ -53,10 +62,11 @@ type Server struct { } // NewServer creates a new boundary log proxy server. -func NewServer(logger slog.Logger, socketPath string) *Server { +func NewServer(logger slog.Logger, socketPath string, registerer prometheus.Registerer) *Server { return &Server{ logger: logger.Named("boundary-log-proxy"), socketPath: socketPath, + metrics: newMetrics(registerer), logs: make(chan *agentproto.ReportBoundaryLogsRequest, logBufferSize), } } @@ -100,9 +110,13 @@ func (s *Server) RunForwarder(ctx context.Context, sender Reporter) error { s.logger.Warn(ctx, "failed to forward boundary logs", slog.Error(err), slog.F("log_count", len(req.Logs))) + s.metrics.batchesDropped.WithLabelValues(droppedReasonForwardFailed).Inc() + s.metrics.logsDropped.WithLabelValues(droppedReasonForwardFailed).Add(float64(len(req.Logs))) // Continue forwarding other logs. The current batch is lost, // but the socket stays alive. 
+ continue } + s.metrics.batchesForwarded.Inc() } } } @@ -177,6 +191,8 @@ func (s *Server) handleMessage(ctx context.Context, msg proto.Message) { switch inner := m.Msg.(type) { case *codec.BoundaryMessage_Logs: s.bufferLogs(ctx, inner.Logs) + case *codec.BoundaryMessage_Status: + s.recordBoundaryStatus(inner.Status) default: s.logger.Warn(ctx, "unknown BoundaryMessage variant") } @@ -185,12 +201,23 @@ func (s *Server) handleMessage(ctx context.Context, msg proto.Message) { } } +func (s *Server) recordBoundaryStatus(status *codec.BoundaryStatus) { + if n := status.GetDroppedChannelFull(); n > 0 { + s.metrics.logsDropped.WithLabelValues(droppedReasonBoundaryChannelFull).Add(float64(n)) + } + if n := status.GetDroppedBatchFull(); n > 0 { + s.metrics.logsDropped.WithLabelValues(droppedReasonBoundaryBatchFull).Add(float64(n)) + } +} + func (s *Server) bufferLogs(ctx context.Context, req *agentproto.ReportBoundaryLogsRequest) { select { case s.logs <- req: default: s.logger.Warn(ctx, "dropping boundary logs, buffer full", slog.F("log_count", len(req.Logs))) + s.metrics.batchesDropped.WithLabelValues(droppedReasonBufferFull).Inc() + s.metrics.logsDropped.WithLabelValues(droppedReasonBufferFull).Add(float64(len(req.Logs))) } } diff --git a/agent/boundarylogproxy/proxy_test.go b/agent/boundarylogproxy/proxy_test.go index d6bf9ec489..8fadeaeeed 100644 --- a/agent/boundarylogproxy/proxy_test.go +++ b/agent/boundarylogproxy/proxy_test.go @@ -11,6 +11,7 @@ import ( "testing" "time" + "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/require" "google.golang.org/protobuf/types/known/timestamppb" @@ -45,6 +46,20 @@ func sendLogs(t *testing.T, conn net.Conn, req *agentproto.ReportBoundaryLogsReq } } +// sendStatus writes a BoundaryMessage envelope containing a BoundaryStatus +// to the connection using TagV2.
+func sendStatus(t *testing.T, conn net.Conn, status *codec.BoundaryStatus) { + t.Helper() + + msg := &codec.BoundaryMessage{ + Msg: &codec.BoundaryMessage_Status{Status: status}, + } + err := codec.WriteMessage(conn, codec.TagV2, msg) + if err != nil { + t.Errorf("write status: %s", err) + } +} + // fakeReporter implements boundarylogproxy.Reporter for testing. type fakeReporter struct { mu sync.Mutex @@ -87,7 +102,7 @@ func TestServer_StartAndClose(t *testing.T) { t.Parallel() socketPath := filepath.Join(testutil.TempDirUnixSocket(t), "boundary.sock") - srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath) + srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath, prometheus.NewRegistry()) err := srv.Start() require.NoError(t, err) @@ -106,7 +121,7 @@ func TestServer_ReceiveAndForwardLogs(t *testing.T) { t.Parallel() socketPath := filepath.Join(testutil.TempDirUnixSocket(t), "boundary.sock") - srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath) + srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath, prometheus.NewRegistry()) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -166,7 +181,7 @@ func TestServer_MultipleMessages(t *testing.T) { t.Parallel() socketPath := filepath.Join(testutil.TempDirUnixSocket(t), "boundary.sock") - srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath) + srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath, prometheus.NewRegistry()) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -218,7 +233,7 @@ func TestServer_MultipleConnections(t *testing.T) { t.Parallel() socketPath := filepath.Join(testutil.TempDirUnixSocket(t), "boundary.sock") - srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath) + srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath, prometheus.NewRegistry()) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -279,7 +294,7 @@ func TestServer_MessageTooLarge(t 
*testing.T) { t.Parallel() socketPath := filepath.Join(testutil.TempDirUnixSocket(t), "boundary.sock") - srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath) + srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath, prometheus.NewRegistry()) err := srv.Start() require.NoError(t, err) @@ -307,7 +322,7 @@ func TestServer_ForwarderContinuesAfterError(t *testing.T) { t.Parallel() socketPath := filepath.Join(testutil.TempDirUnixSocket(t), "boundary.sock") - srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath) + srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath, prometheus.NewRegistry()) err := srv.Start() require.NoError(t, err) @@ -392,7 +407,7 @@ func TestServer_CloseStopsForwarder(t *testing.T) { t.Parallel() socketPath := filepath.Join(testutil.TempDirUnixSocket(t), "boundary.sock") - srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath) + srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath, prometheus.NewRegistry()) err := srv.Start() require.NoError(t, err) @@ -421,7 +436,7 @@ func TestServer_InvalidProtobuf(t *testing.T) { t.Parallel() socketPath := filepath.Join(testutil.TempDirUnixSocket(t), "boundary.sock") - srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath) + srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath, prometheus.NewRegistry()) err := srv.Start() require.NoError(t, err) @@ -480,7 +495,7 @@ func TestServer_InvalidHeader(t *testing.T) { t.Parallel() socketPath := filepath.Join(testutil.TempDirUnixSocket(t), "boundary.sock") - srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath) + srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath, prometheus.NewRegistry()) err := srv.Start() require.NoError(t, err) @@ -530,7 +545,7 @@ func TestServer_AllowRequest(t *testing.T) { t.Parallel() socketPath := filepath.Join(testutil.TempDirUnixSocket(t), "boundary.sock") - srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath) + srv 
:= boundarylogproxy.NewServer(testutil.Logger(t), socketPath, prometheus.NewRegistry()) err := srv.Start() require.NoError(t, err) @@ -588,7 +603,7 @@ func TestServer_TagV1BackwardsCompatibility(t *testing.T) { t.Parallel() socketPath := filepath.Join(testutil.TempDirUnixSocket(t), "boundary.sock") - srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath) + srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath, prometheus.NewRegistry()) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -660,3 +675,181 @@ func TestServer_TagV1BackwardsCompatibility(t *testing.T) { cancel() <-forwarderDone } + +func TestServer_Metrics(t *testing.T) { + t.Parallel() + + makeReq := func(n int) *agentproto.ReportBoundaryLogsRequest { + logs := make([]*agentproto.BoundaryLog, n) + for i := range n { + logs[i] = &agentproto.BoundaryLog{ + Allowed: true, + Time: timestamppb.Now(), + Resource: &agentproto.BoundaryLog_HttpRequest_{ + HttpRequest: &agentproto.BoundaryLog_HttpRequest{ + Method: "GET", + Url: "https://example.com", + }, + }, + } + } + return &agentproto.ReportBoundaryLogsRequest{Logs: logs} + } + + // BufferFull needs its own setup because it intentionally does not run + // a forwarder so the channel fills up. + t.Run("BufferFull", func(t *testing.T) { + t.Parallel() + + reg := prometheus.NewRegistry() + socketPath := filepath.Join(testutil.TempDirUnixSocket(t), "boundary.sock") + srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath, reg) + + err := srv.Start() + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, srv.Close()) }) + + conn, err := net.Dial("unix", socketPath) + require.NoError(t, err) + defer conn.Close() + + // Fill the buffer (size 100) without running a forwarder so nothing + // drains. Then send one more to trigger the drop path. 
+ for range 101 { + sendLogs(t, conn, makeReq(1)) + } + + require.Eventually(t, func() bool { + return getCounterVecValue(t, reg, "agent_boundary_log_proxy_batches_dropped_total", "buffer_full") >= 1 + }, testutil.WaitShort, testutil.IntervalFast) + require.GreaterOrEqual(t, + getCounterVecValue(t, reg, "agent_boundary_log_proxy_logs_dropped_total", "buffer_full"), + float64(1)) + }) + + // The remaining metrics share one server, forwarder, and connection. The + // phases run sequentially so metrics accumulate. + t.Run("Forwarding", func(t *testing.T) { + t.Parallel() + + reg := prometheus.NewRegistry() + socketPath := filepath.Join(testutil.TempDirUnixSocket(t), "boundary.sock") + srv := boundarylogproxy.NewServer(testutil.Logger(t), socketPath, reg) + + err := srv.Start() + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, srv.Close()) }) + + reportNotify := make(chan struct{}, 4) + reporter := &fakeReporter{ + err: context.DeadlineExceeded, + errOnce: true, + reportCb: func() { + select { + case reportNotify <- struct{}{}: + default: + } + }, + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + forwarderDone := make(chan error, 1) + go func() { + forwarderDone <- srv.RunForwarder(ctx, reporter) + }() + + conn, err := net.Dial("unix", socketPath) + require.NoError(t, err) + defer conn.Close() + + // Phase 1: the first forward errors + sendLogs(t, conn, makeReq(2)) + + select { + case <-reportNotify: + case <-time.After(testutil.WaitShort): + t.Fatal("timed out waiting for forward attempt") + } + + // The metric is incremented after ReportBoundaryLogs returns, so we + // need to poll briefly. 
+ require.Eventually(t, func() bool { + return getCounterVecValue(t, reg, "agent_boundary_log_proxy_batches_dropped_total", "forward_failed") >= 1 + }, testutil.WaitShort, testutil.IntervalFast) + require.Equal(t, float64(2), + getCounterVecValue(t, reg, "agent_boundary_log_proxy_logs_dropped_total", "forward_failed")) + + // Phase 2: forward succeeds. + sendLogs(t, conn, makeReq(1)) + + require.Eventually(t, func() bool { + return len(reporter.getLogs()) >= 1 + }, testutil.WaitShort, testutil.IntervalFast) + require.Equal(t, float64(1), + getCounterValue(t, reg, "agent_boundary_log_proxy_batches_forwarded_total")) + + // Phase 3: boundary-reported drop counts arrive as a separate BoundaryStatus + // message, not piggybacked on log batches. + sendStatus(t, conn, &codec.BoundaryStatus{ + DroppedChannelFull: 5, + DroppedBatchFull: 3, + }) + + // Status is handled immediately by the reader goroutine, not by the + // forwarder, so poll metrics directly. + require.Eventually(t, func() bool { + return getCounterVecValue(t, reg, "agent_boundary_log_proxy_logs_dropped_total", "boundary_channel_full") >= 5 + }, testutil.WaitShort, testutil.IntervalFast) + require.Equal(t, float64(5), + getCounterVecValue(t, reg, "agent_boundary_log_proxy_logs_dropped_total", "boundary_channel_full")) + require.Equal(t, float64(3), + getCounterVecValue(t, reg, "agent_boundary_log_proxy_logs_dropped_total", "boundary_batch_full")) + + cancel() + <-forwarderDone + }) +} + +// getCounterVecValue returns the current value of a CounterVec metric filtered +// by the given reason label. 
+func getCounterVecValue(t *testing.T, reg *prometheus.Registry, name, reason string) float64 { + t.Helper() + + metrics, err := reg.Gather() + require.NoError(t, err) + + for _, mf := range metrics { + if mf.GetName() != name { + continue + } + for _, m := range mf.GetMetric() { + for _, lp := range m.GetLabel() { + if lp.GetName() == "reason" && lp.GetValue() == reason { + return m.GetCounter().GetValue() + } + } + } + } + + return 0 +} + +// getCounterValue returns the current value of a Counter metric. +func getCounterValue(t *testing.T, reg *prometheus.Registry, name string) float64 { + t.Helper() + + metrics, err := reg.Gather() + require.NoError(t, err) + + for _, mf := range metrics { + if mf.GetName() != name { + continue + } + for _, m := range mf.GetMetric() { + return m.GetCounter().GetValue() + } + } + + return 0 +} diff --git a/docs/admin/integrations/prometheus.md b/docs/admin/integrations/prometheus.md index 7fed2747d3..2353b819ae 100644 --- a/docs/admin/integrations/prometheus.md +++ b/docs/admin/integrations/prometheus.md @@ -104,171 +104,174 @@ deployment. They will always be available from the agent. -| Name | Type | Description | Labels | -|-------------------------------------------------------------------------|-----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------| -| `agent_scripts_executed_total` | counter | Total number of scripts executed by the Coder agent. Includes cron scheduled scripts. 
| `agent_name` `success` `template_name` `username` `workspace_name` | -| `coder_aibridged_circuit_breaker_rejects_total` | counter | Total number of requests rejected due to open circuit breaker. | `endpoint` `model` `provider` | -| `coder_aibridged_circuit_breaker_state` | gauge | Current state of the circuit breaker (0=closed, 0.5=half-open, 1=open). | `endpoint` `model` `provider` | -| `coder_aibridged_circuit_breaker_trips_total` | counter | Total number of times the circuit breaker transitioned to open state. | `endpoint` `model` `provider` | -| `coder_aibridged_injected_tool_invocations_total` | counter | The number of times an injected MCP tool was invoked by aibridge. | `model` `name` `provider` `server` | -| `coder_aibridged_interceptions_duration_seconds` | histogram | The total duration of intercepted requests, in seconds. The majority of this time will be the upstream processing of the request. aibridge has no control over upstream processing time, so it's just an illustrative metric. | `model` `provider` | -| `coder_aibridged_interceptions_inflight` | gauge | The number of intercepted requests which are being processed. | `model` `provider` `route` | -| `coder_aibridged_interceptions_total` | counter | The count of intercepted requests. | `initiator_id` `method` `model` `provider` `route` `status` | -| `coder_aibridged_non_injected_tool_selections_total` | counter | The number of times an AI model selected a tool to be invoked by the client. | `model` `name` `provider` | -| `coder_aibridged_passthrough_total` | counter | The count of requests which were not intercepted but passed through to the upstream. | `method` `provider` `route` | -| `coder_aibridged_prompts_total` | counter | The number of prompts issued by users (initiators). | `initiator_id` `model` `provider` | -| `coder_aibridged_tokens_total` | counter | The number of tokens used by intercepted requests. 
| `initiator_id` `model` `provider` `type` | -| `coder_aibridgeproxyd_connect_sessions_total` | counter | Total number of CONNECT sessions established. | `type` | -| `coder_aibridgeproxyd_inflight_mitm_requests` | gauge | Number of MITM requests currently being processed. | `provider` | -| `coder_aibridgeproxyd_mitm_requests_total` | counter | Total number of MITM requests handled by the proxy. | `provider` | -| `coder_aibridgeproxyd_mitm_responses_total` | counter | Total number of MITM responses by HTTP status code class. | `code` `provider` | -| `coder_pubsub_connected` | gauge | Whether we are connected (1) or not connected (0) to postgres | | -| `coder_pubsub_current_events` | gauge | The current number of pubsub event channels listened for | | -| `coder_pubsub_current_subscribers` | gauge | The current number of active pubsub subscribers | | -| `coder_pubsub_disconnections_total` | counter | Total number of times we disconnected unexpectedly from postgres | | -| `coder_pubsub_latency_measure_errs_total` | counter | The number of pubsub latency measurement failures | | -| `coder_pubsub_latency_measures_total` | counter | The number of pubsub latency measurements | | -| `coder_pubsub_messages_total` | counter | Total number of messages received from postgres | `size` | -| `coder_pubsub_published_bytes_total` | counter | Total number of bytes successfully published across all publishes | | -| `coder_pubsub_publishes_total` | counter | Total number of calls to Publish | `success` | -| `coder_pubsub_receive_latency_seconds` | gauge | The time taken to receive a message from a pubsub event channel | | -| `coder_pubsub_received_bytes_total` | counter | Total number of bytes received across all messages | | -| `coder_pubsub_send_latency_seconds` | gauge | The time taken to send a message into a pubsub event channel | | -| `coder_pubsub_subscribes_total` | counter | Total number of calls to Subscribe/SubscribeWithErr | `success` | -| 
`coder_servertailnet_connections_total` | counter | Total number of TCP connections made to workspace agents. | `network` | -| `coder_servertailnet_open_connections` | gauge | Total number of TCP connections currently open to workspace agents. | `network` | -| `coderd_agentapi_metadata_batch_size` | histogram | Total number of metadata entries in each batch, updated before flushes. | | -| `coderd_agentapi_metadata_batch_utilization` | histogram | Number of metadata keys per agent in each batch, updated before flushes. | | -| `coderd_agentapi_metadata_batches_total` | counter | Total number of metadata batches flushed. | `reason` | -| `coderd_agentapi_metadata_dropped_keys_total` | counter | Total number of metadata keys dropped due to capacity limits. | | -| `coderd_agentapi_metadata_flush_duration_seconds` | histogram | Time taken to flush metadata batch to database and pubsub. | `reason` | -| `coderd_agentapi_metadata_flushed_total` | counter | Total number of unique metadatas flushed. | | -| `coderd_agentapi_metadata_publish_errors_total` | counter | Total number of metadata batch pubsub publish calls that have resulted in an error. | | -| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | -| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` | -| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | -| `coderd_agents_up` | gauge | The number of active agents per workspace. 
| `template_name` `template_version` `username` `workspace_name` | -| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_currently_reachable_peers` | gauge | The number of peers (e.g. clients) that are currently reachable over the encrypted network. | `agent_name` `connection_type` `template_name` `username` `workspace_name` | -| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_ssh` | gauge | The number of session established by SSH | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_vscode` | gauge | The number of session established by VSCode | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_startup_script_seconds` | gauge | The number of seconds the startup script took to execute. | `agent_name` `success` `template_name` `username` `workspace_name` | -| `coderd_agentstats_tx_bytes` | gauge | Agent Tx bytes | `agent_name` `username` `workspace_name` | -| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | -| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | `method` `path` | -| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. 
| `path` | -| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | -| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | -| `coderd_api_total_user_count` | gauge | The total number of registered users, partitioned by status. | `status` | -| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | -| `coderd_api_workspace_latest_build` | gauge | The current number of workspace builds by status for all non-deleted workspaces. | `status` | -| `coderd_authz_authorize_duration_seconds` | histogram | Duration of the 'Authorize' call in seconds. Only counts calls that succeed. | `allowed` | -| `coderd_authz_prepare_authorize_duration_seconds` | histogram | Duration of the 'PrepareAuthorize' call in seconds. | | -| `coderd_db_query_counts_total` | counter | Total number of queries labelled by HTTP route, method, and query name. | `method` `query` `route` | -| `coderd_db_query_latencies_seconds` | histogram | Latency distribution of queries in seconds. | `query` | -| `coderd_db_tx_duration_seconds` | histogram | Duration of transactions in seconds. | `success` `tx_id` | -| `coderd_db_tx_executions_count` | counter | Total count of transactions executed. 'retries' is expected to be 0 for a successful transaction. | `retries` `success` `tx_id` | -| `coderd_dbpurge_iteration_duration_seconds` | histogram | Duration of each dbpurge iteration in seconds. | `success` | -| `coderd_dbpurge_records_purged_total` | counter | Total number of records purged by type. | `record_type` | -| `coderd_experiments` | gauge | Indicates whether each experiment is enabled (1) or not (0) | `experiment` | -| `coderd_insights_applications_usage_seconds` | gauge | The application usage per template. 
| `application_name` `organization_name` `slug` `template_name` | -| `coderd_insights_parameters` | gauge | The parameter usage per template. | `organization_name` `parameter_name` `parameter_type` `parameter_value` `template_name` | -| `coderd_insights_templates_active_users` | gauge | The number of active users of the template. | `organization_name` `template_name` | -| `coderd_license_active_users` | gauge | The number of active users. | | -| `coderd_license_errors` | gauge | The number of active license errors. | | -| `coderd_license_limit_users` | gauge | The user seats limit based on the active Coder license. | | -| `coderd_license_user_limit_enabled` | gauge | Returns 1 if the current license enforces the user limit. | | -| `coderd_license_warnings` | gauge | The number of active license warnings. | | -| `coderd_lifecycle_autobuild_execution_duration_seconds` | histogram | Duration of each autobuild execution. | | -| `coderd_notifications_dispatcher_send_seconds` | histogram | The time taken to dispatch notifications. | `method` | -| `coderd_notifications_inflight_dispatches` | gauge | The number of dispatch attempts which are currently in progress. | `method` `notification_template_id` | -| `coderd_notifications_pending_updates` | gauge | The number of dispatch attempt results waiting to be flushed to the store. | | -| `coderd_notifications_queued_seconds` | histogram | The time elapsed between a notification being enqueued in the store and retrieved for dispatching (measures the latency of the notifications system). This should generally be within CODER_NOTIFICATIONS_FETCH_INTERVAL seconds; higher values for a sustained period indicates delayed processing and CODER_NOTIFICATIONS_LEASE_COUNT can be increased to accommodate this. | `method` | -| `coderd_notifications_retry_count` | counter | The count of notification dispatch retry attempts. 
| `method` `notification_template_id` | -| `coderd_notifications_synced_updates_total` | counter | The number of dispatch attempt results flushed to the store. | | -| `coderd_oauth2_external_requests_rate_limit` | gauge | The total number of allowed requests per interval. | `name` `resource` | -| `coderd_oauth2_external_requests_rate_limit_next_reset_unix` | gauge | Unix timestamp for when the next interval starts | `name` `resource` | -| `coderd_oauth2_external_requests_rate_limit_remaining` | gauge | The remaining number of allowed requests in this interval. | `name` `resource` | -| `coderd_oauth2_external_requests_rate_limit_reset_in_seconds` | gauge | Seconds until the next interval | `name` `resource` | -| `coderd_oauth2_external_requests_rate_limit_used` | gauge | The number of requests made in this interval. | `name` `resource` | -| `coderd_oauth2_external_requests_total` | counter | The total number of api calls made to external oauth2 providers. 'status_code' will be 0 if the request failed with no response. | `name` `source` `status_code` | -| `coderd_open_file_refs_current` | gauge | The count of file references currently open in the file cache. Multiple references can be held for the same file. | | -| `coderd_open_file_refs_total` | counter | The total number of file references ever opened in the file cache. The 'hit' label indicates if the file was loaded from the cache. | `hit` | -| `coderd_open_files_current` | gauge | The count of unique files currently open in the file cache. | | -| `coderd_open_files_size_bytes_current` | gauge | The current amount of memory of all files currently open in the file cache. | | -| `coderd_open_files_size_bytes_total` | counter | The total amount of memory ever opened in the file cache. This number never decrements. | | -| `coderd_open_files_total` | counter | The total count of unique files ever opened in the file cache. 
| | -| `coderd_prebuilds_reconciliation_duration_seconds` | histogram | Duration of each prebuilds reconciliation cycle. | | -| `coderd_prebuilt_workspace_claim_duration_seconds` | histogram | Time to claim a prebuilt workspace by organization, template, and preset. | `organization_name` `preset_name` `template_name` | -| `coderd_prebuilt_workspaces_claimed_total` | counter | Total number of prebuilt workspaces which were claimed by users. Claiming refers to creating a workspace with a preset selected for which eligible prebuilt workspaces are available and one is reassigned to a user. | `organization_name` `preset_name` `template_name` | -| `coderd_prebuilt_workspaces_created_total` | counter | Total number of prebuilt workspaces that have been created to meet the desired instance count of each template preset. | `organization_name` `preset_name` `template_name` | -| `coderd_prebuilt_workspaces_desired` | gauge | Target number of prebuilt workspaces that should be available for each template preset. | `organization_name` `preset_name` `template_name` | -| `coderd_prebuilt_workspaces_eligible` | gauge | Current number of prebuilt workspaces that are eligible to be claimed by users. These are workspaces that have completed their build process with their agent reporting 'ready' status. | `organization_name` `preset_name` `template_name` | -| `coderd_prebuilt_workspaces_failed_total` | counter | Total number of prebuilt workspaces that failed to build. | `organization_name` `preset_name` `template_name` | -| `coderd_prebuilt_workspaces_metrics_last_updated` | gauge | The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached. | | -| `coderd_prebuilt_workspaces_preset_hard_limited` | gauge | Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise. 
| `organization_name` `preset_name` `template_name` | -| `coderd_prebuilt_workspaces_preset_validation_failed` | gauge | Indicates whether a given preset has validation failures (1 = validation failed). Metric is omitted otherwise. | `organization_name` `preset_name` `template_name` | -| `coderd_prebuilt_workspaces_reconciliation_paused` | gauge | Indicates whether prebuilds reconciliation is currently paused (1 = paused, 0 = not paused). | | -| `coderd_prebuilt_workspaces_resource_replacements_total` | counter | Total number of prebuilt workspaces whose resource(s) got replaced upon being claimed. In Terraform, drift on immutable attributes results in resource replacement. This represents a worst-case scenario for prebuilt workspaces because the pre-provisioned resource would have been recreated when claiming, thus obviating the point of pre-provisioning. See https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#preventing-resource-replacement | `organization_name` `preset_name` `template_name` | -| `coderd_prebuilt_workspaces_running` | gauge | Current number of prebuilt workspaces that are in a running state. These workspaces have started successfully but may not yet be claimable by users (see coderd_prebuilt_workspaces_eligible). | `organization_name` `preset_name` `template_name` | -| `coderd_prometheusmetrics_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | | -| `coderd_prometheusmetrics_agentstats_execution_seconds` | histogram | Histogram for duration of agent stats metrics collection in seconds. | | -| `coderd_prometheusmetrics_metrics_aggregator_execution_cleanup_seconds` | histogram | Histogram for duration of metrics aggregator cleanup in seconds. | | -| `coderd_prometheusmetrics_metrics_aggregator_execution_update_seconds` | histogram | Histogram for duration of metrics aggregator update in seconds. 
| | -| `coderd_prometheusmetrics_metrics_aggregator_store_size` | gauge | The number of metrics stored in the aggregator | | -| `coderd_provisioner_job_queue_wait_seconds` | histogram | Time from job creation to acquisition by a provisioner daemon. | `build_reason` `job_type` `provisioner_type` `transition` | -| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | -| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | -| `coderd_provisionerd_num_daemons` | gauge | The number of provisioner daemons. | | -| `coderd_provisionerd_workspace_build_timings_seconds` | histogram | The time taken for a workspace to build. | `status` `template_name` `template_version` `workspace_transition` | -| `coderd_proxyhealth_health_check_duration_seconds` | histogram | Histogram for duration of proxy health collection in seconds. | | -| `coderd_proxyhealth_health_check_results` | gauge | This endpoint returns a number to indicate the health status. -3 (unknown), -2 (Unreachable), -1 (Unhealthy), 0 (Unregistered), 1 (Healthy) | `proxy_id` | -| `coderd_template_workspace_build_duration_seconds` | histogram | Duration from workspace build creation to agent ready, by template. | `is_prebuild` `organization_name` `status` `template_name` `transition` | -| `coderd_workspace_builds_enqueued_total` | counter | Total number of workspace build enqueue attempts. | `build_reason` `provisioner_type` `status` `transition` | -| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `status` `template_name` `template_version` `workspace_name` `workspace_owner` `workspace_transition` | -| `coderd_workspace_creation_duration_seconds` | histogram | Time to create a workspace by organization, template, preset, and type (regular or prebuild). 
| `organization_name` `preset_name` `template_name` `type` | -| `coderd_workspace_creation_total` | counter | Total regular (non-prebuilt) workspace creations by organization, template, and preset. | `organization_name` `preset_name` `template_name` | -| `coderd_workspace_latest_build_status` | gauge | The current workspace statuses by template, transition, and owner for all non-deleted workspaces. | `status` `template_name` `template_version` `workspace_owner` `workspace_transition` | -| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | -| `go_goroutines` | gauge | Number of goroutines that currently exist. | | -| `go_info` | gauge | Information about the Go environment. | `version` | -| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | -| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | -| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | -| `go_memstats_frees_total` | counter | Total number of frees. | | -| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | -| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | -| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | -| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | -| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | -| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | -| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | -| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | -| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | -| `go_memstats_mallocs_total` | counter | Total number of mallocs. 
| | -| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | -| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | -| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | -| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | -| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | -| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | -| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | -| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | -| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | -| `go_threads` | gauge | Number of OS threads created. | | -| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | -| `process_max_fds` | gauge | Maximum number of open file descriptors. | | -| `process_open_fds` | gauge | Number of open file descriptors. | | -| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | -| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | -| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | -| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | -| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | -| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. 
| `code` | +| Name | Type | Description | Labels | +|-------------------------------------------------------------------------|-----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------| +| `agent_boundary_log_proxy_batches_dropped_total` | counter | Total number of boundary log batches dropped before reaching coderd. Reason: buffer_full = the agent's internal buffer is full, meaning boundary is producing logs faster than the agent can forward them to coderd; forward_failed = the agent failed to send the batch to coderd, potentially because coderd is unreachable or the connection was interrupted. | `reason` | +| `agent_boundary_log_proxy_batches_forwarded_total` | counter | Total number of boundary log batches successfully forwarded to coderd. Compare with batches_dropped_total to compute a drop rate. | | +| `agent_boundary_log_proxy_logs_dropped_total` | counter | Total number of individual boundary log entries dropped before reaching coderd. Reason: buffer_full = the agent's internal buffer is full; forward_failed = the agent failed to send the batch to coderd; boundary_channel_full = boundary's internal send channel overflowed, meaning boundary is generating logs faster than it can batch and send them; boundary_batch_full = boundary's outgoing batch buffer overflowed after a failed flush, meaning boundary could not write to the agent's socket. 
| `reason` | +| `agent_scripts_executed_total` | counter | Total number of scripts executed by the Coder agent. Includes cron scheduled scripts. | `agent_name` `success` `template_name` `username` `workspace_name` | +| `coder_aibridged_circuit_breaker_rejects_total` | counter | Total number of requests rejected due to open circuit breaker. | `endpoint` `model` `provider` | +| `coder_aibridged_circuit_breaker_state` | gauge | Current state of the circuit breaker (0=closed, 0.5=half-open, 1=open). | `endpoint` `model` `provider` | +| `coder_aibridged_circuit_breaker_trips_total` | counter | Total number of times the circuit breaker transitioned to open state. | `endpoint` `model` `provider` | +| `coder_aibridged_injected_tool_invocations_total` | counter | The number of times an injected MCP tool was invoked by aibridge. | `model` `name` `provider` `server` | +| `coder_aibridged_interceptions_duration_seconds` | histogram | The total duration of intercepted requests, in seconds. The majority of this time will be the upstream processing of the request. aibridge has no control over upstream processing time, so it's just an illustrative metric. | `model` `provider` | +| `coder_aibridged_interceptions_inflight` | gauge | The number of intercepted requests which are being processed. | `model` `provider` `route` | +| `coder_aibridged_interceptions_total` | counter | The count of intercepted requests. | `initiator_id` `method` `model` `provider` `route` `status` | +| `coder_aibridged_non_injected_tool_selections_total` | counter | The number of times an AI model selected a tool to be invoked by the client. | `model` `name` `provider` | +| `coder_aibridged_passthrough_total` | counter | The count of requests which were not intercepted but passed through to the upstream. | `method` `provider` `route` | +| `coder_aibridged_prompts_total` | counter | The number of prompts issued by users (initiators). 
| `initiator_id` `model` `provider` | +| `coder_aibridged_tokens_total` | counter | The number of tokens used by intercepted requests. | `initiator_id` `model` `provider` `type` | +| `coder_aibridgeproxyd_connect_sessions_total` | counter | Total number of CONNECT sessions established. | `type` | +| `coder_aibridgeproxyd_inflight_mitm_requests` | gauge | Number of MITM requests currently being processed. | `provider` | +| `coder_aibridgeproxyd_mitm_requests_total` | counter | Total number of MITM requests handled by the proxy. | `provider` | +| `coder_aibridgeproxyd_mitm_responses_total` | counter | Total number of MITM responses by HTTP status code class. | `code` `provider` | +| `coder_pubsub_connected` | gauge | Whether we are connected (1) or not connected (0) to postgres | | +| `coder_pubsub_current_events` | gauge | The current number of pubsub event channels listened for | | +| `coder_pubsub_current_subscribers` | gauge | The current number of active pubsub subscribers | | +| `coder_pubsub_disconnections_total` | counter | Total number of times we disconnected unexpectedly from postgres | | +| `coder_pubsub_latency_measure_errs_total` | counter | The number of pubsub latency measurement failures | | +| `coder_pubsub_latency_measures_total` | counter | The number of pubsub latency measurements | | +| `coder_pubsub_messages_total` | counter | Total number of messages received from postgres | `size` | +| `coder_pubsub_published_bytes_total` | counter | Total number of bytes successfully published across all publishes | | +| `coder_pubsub_publishes_total` | counter | Total number of calls to Publish | `success` | +| `coder_pubsub_receive_latency_seconds` | gauge | The time taken to receive a message from a pubsub event channel | | +| `coder_pubsub_received_bytes_total` | counter | Total number of bytes received across all messages | | +| `coder_pubsub_send_latency_seconds` | gauge | The time taken to send a message into a pubsub event channel | | +| 
`coder_pubsub_subscribes_total` | counter | Total number of calls to Subscribe/SubscribeWithErr | `success` | +| `coder_servertailnet_connections_total` | counter | Total number of TCP connections made to workspace agents. | `network` | +| `coder_servertailnet_open_connections` | gauge | Total number of TCP connections currently open to workspace agents. | `network` | +| `coderd_agentapi_metadata_batch_size` | histogram | Total number of metadata entries in each batch, updated before flushes. | | +| `coderd_agentapi_metadata_batch_utilization` | histogram | Number of metadata keys per agent in each batch, updated before flushes. | | +| `coderd_agentapi_metadata_batches_total` | counter | Total number of metadata batches flushed. | `reason` | +| `coderd_agentapi_metadata_dropped_keys_total` | counter | Total number of metadata keys dropped due to capacity limits. | | +| `coderd_agentapi_metadata_flush_duration_seconds` | histogram | Time taken to flush metadata batch to database and pubsub. | `reason` | +| `coderd_agentapi_metadata_flushed_total` | counter | Total number of unique metadatas flushed. | | +| `coderd_agentapi_metadata_publish_errors_total` | counter | Total number of metadata batch pubsub publish calls that have resulted in an error. | | +| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | +| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` | +| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | +| `coderd_agents_up` | gauge | The number of active agents per workspace. 
| `template_name` `template_version` `username` `workspace_name` | +| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_currently_reachable_peers` | gauge | The number of peers (e.g. clients) that are currently reachable over the encrypted network. | `agent_name` `connection_type` `template_name` `username` `workspace_name` | +| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_ssh` | gauge | The number of session established by SSH | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_vscode` | gauge | The number of session established by VSCode | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_startup_script_seconds` | gauge | The number of seconds the startup script took to execute. | `agent_name` `success` `template_name` `username` `workspace_name` | +| `coderd_agentstats_tx_bytes` | gauge | Agent Tx bytes | `agent_name` `username` `workspace_name` | +| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | +| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | `method` `path` | +| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. 
| `path` | +| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | +| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | +| `coderd_api_total_user_count` | gauge | The total number of registered users, partitioned by status. | `status` | +| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | +| `coderd_api_workspace_latest_build` | gauge | The current number of workspace builds by status for all non-deleted workspaces. | `status` | +| `coderd_authz_authorize_duration_seconds` | histogram | Duration of the 'Authorize' call in seconds. Only counts calls that succeed. | `allowed` | +| `coderd_authz_prepare_authorize_duration_seconds` | histogram | Duration of the 'PrepareAuthorize' call in seconds. | | +| `coderd_db_query_counts_total` | counter | Total number of queries labelled by HTTP route, method, and query name. | `method` `query` `route` | +| `coderd_db_query_latencies_seconds` | histogram | Latency distribution of queries in seconds. | `query` | +| `coderd_db_tx_duration_seconds` | histogram | Duration of transactions in seconds. | `success` `tx_id` | +| `coderd_db_tx_executions_count` | counter | Total count of transactions executed. 'retries' is expected to be 0 for a successful transaction. | `retries` `success` `tx_id` | +| `coderd_dbpurge_iteration_duration_seconds` | histogram | Duration of each dbpurge iteration in seconds. | `success` | +| `coderd_dbpurge_records_purged_total` | counter | Total number of records purged by type. | `record_type` | +| `coderd_experiments` | gauge | Indicates whether each experiment is enabled (1) or not (0) | `experiment` | +| `coderd_insights_applications_usage_seconds` | gauge | The application usage per template. 
| `application_name` `organization_name` `slug` `template_name` | +| `coderd_insights_parameters` | gauge | The parameter usage per template. | `organization_name` `parameter_name` `parameter_type` `parameter_value` `template_name` | +| `coderd_insights_templates_active_users` | gauge | The number of active users of the template. | `organization_name` `template_name` | +| `coderd_license_active_users` | gauge | The number of active users. | | +| `coderd_license_errors` | gauge | The number of active license errors. | | +| `coderd_license_limit_users` | gauge | The user seats limit based on the active Coder license. | | +| `coderd_license_user_limit_enabled` | gauge | Returns 1 if the current license enforces the user limit. | | +| `coderd_license_warnings` | gauge | The number of active license warnings. | | +| `coderd_lifecycle_autobuild_execution_duration_seconds` | histogram | Duration of each autobuild execution. | | +| `coderd_notifications_dispatcher_send_seconds` | histogram | The time taken to dispatch notifications. | `method` | +| `coderd_notifications_inflight_dispatches` | gauge | The number of dispatch attempts which are currently in progress. | `method` `notification_template_id` | +| `coderd_notifications_pending_updates` | gauge | The number of dispatch attempt results waiting to be flushed to the store. | | +| `coderd_notifications_queued_seconds` | histogram | The time elapsed between a notification being enqueued in the store and retrieved for dispatching (measures the latency of the notifications system). This should generally be within CODER_NOTIFICATIONS_FETCH_INTERVAL seconds; higher values for a sustained period indicate delayed processing and CODER_NOTIFICATIONS_LEASE_COUNT can be increased to accommodate this. | `method` | +| `coderd_notifications_retry_count` | counter | The count of notification dispatch retry attempts. 
| `method` `notification_template_id` | +| `coderd_notifications_synced_updates_total` | counter | The number of dispatch attempt results flushed to the store. | | +| `coderd_oauth2_external_requests_rate_limit` | gauge | The total number of allowed requests per interval. | `name` `resource` | +| `coderd_oauth2_external_requests_rate_limit_next_reset_unix` | gauge | Unix timestamp for when the next interval starts | `name` `resource` | +| `coderd_oauth2_external_requests_rate_limit_remaining` | gauge | The remaining number of allowed requests in this interval. | `name` `resource` | +| `coderd_oauth2_external_requests_rate_limit_reset_in_seconds` | gauge | Seconds until the next interval | `name` `resource` | +| `coderd_oauth2_external_requests_rate_limit_used` | gauge | The number of requests made in this interval. | `name` `resource` | +| `coderd_oauth2_external_requests_total` | counter | The total number of api calls made to external oauth2 providers. 'status_code' will be 0 if the request failed with no response. | `name` `source` `status_code` | +| `coderd_open_file_refs_current` | gauge | The count of file references currently open in the file cache. Multiple references can be held for the same file. | | +| `coderd_open_file_refs_total` | counter | The total number of file references ever opened in the file cache. The 'hit' label indicates if the file was loaded from the cache. | `hit` | +| `coderd_open_files_current` | gauge | The count of unique files currently open in the file cache. | | +| `coderd_open_files_size_bytes_current` | gauge | The current amount of memory of all files currently open in the file cache. | | +| `coderd_open_files_size_bytes_total` | counter | The total amount of memory ever opened in the file cache. This number never decrements. | | +| `coderd_open_files_total` | counter | The total count of unique files ever opened in the file cache. 
| | +| `coderd_prebuilds_reconciliation_duration_seconds` | histogram | Duration of each prebuilds reconciliation cycle. | | +| `coderd_prebuilt_workspace_claim_duration_seconds` | histogram | Time to claim a prebuilt workspace by organization, template, and preset. | `organization_name` `preset_name` `template_name` | +| `coderd_prebuilt_workspaces_claimed_total` | counter | Total number of prebuilt workspaces which were claimed by users. Claiming refers to creating a workspace with a preset selected for which eligible prebuilt workspaces are available and one is reassigned to a user. | `organization_name` `preset_name` `template_name` | +| `coderd_prebuilt_workspaces_created_total` | counter | Total number of prebuilt workspaces that have been created to meet the desired instance count of each template preset. | `organization_name` `preset_name` `template_name` | +| `coderd_prebuilt_workspaces_desired` | gauge | Target number of prebuilt workspaces that should be available for each template preset. | `organization_name` `preset_name` `template_name` | +| `coderd_prebuilt_workspaces_eligible` | gauge | Current number of prebuilt workspaces that are eligible to be claimed by users. These are workspaces that have completed their build process with their agent reporting 'ready' status. | `organization_name` `preset_name` `template_name` | +| `coderd_prebuilt_workspaces_failed_total` | counter | Total number of prebuilt workspaces that failed to build. | `organization_name` `preset_name` `template_name` | +| `coderd_prebuilt_workspaces_metrics_last_updated` | gauge | The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached. | | +| `coderd_prebuilt_workspaces_preset_hard_limited` | gauge | Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise. 
| `organization_name` `preset_name` `template_name` | +| `coderd_prebuilt_workspaces_preset_validation_failed` | gauge | Indicates whether a given preset has validation failures (1 = validation failed). Metric is omitted otherwise. | `organization_name` `preset_name` `template_name` | +| `coderd_prebuilt_workspaces_reconciliation_paused` | gauge | Indicates whether prebuilds reconciliation is currently paused (1 = paused, 0 = not paused). | | +| `coderd_prebuilt_workspaces_resource_replacements_total` | counter | Total number of prebuilt workspaces whose resource(s) got replaced upon being claimed. In Terraform, drift on immutable attributes results in resource replacement. This represents a worst-case scenario for prebuilt workspaces because the pre-provisioned resource would have been recreated when claiming, thus obviating the point of pre-provisioning. See https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#preventing-resource-replacement | `organization_name` `preset_name` `template_name` | +| `coderd_prebuilt_workspaces_running` | gauge | Current number of prebuilt workspaces that are in a running state. These workspaces have started successfully but may not yet be claimable by users (see coderd_prebuilt_workspaces_eligible). | `organization_name` `preset_name` `template_name` | +| `coderd_prometheusmetrics_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | | +| `coderd_prometheusmetrics_agentstats_execution_seconds` | histogram | Histogram for duration of agent stats metrics collection in seconds. | | +| `coderd_prometheusmetrics_metrics_aggregator_execution_cleanup_seconds` | histogram | Histogram for duration of metrics aggregator cleanup in seconds. | | +| `coderd_prometheusmetrics_metrics_aggregator_execution_update_seconds` | histogram | Histogram for duration of metrics aggregator update in seconds. 
| | +| `coderd_prometheusmetrics_metrics_aggregator_store_size` | gauge | The number of metrics stored in the aggregator | | +| `coderd_provisioner_job_queue_wait_seconds` | histogram | Time from job creation to acquisition by a provisioner daemon. | `build_reason` `job_type` `provisioner_type` `transition` | +| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | +| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | +| `coderd_provisionerd_num_daemons` | gauge | The number of provisioner daemons. | | +| `coderd_provisionerd_workspace_build_timings_seconds` | histogram | The time taken for a workspace to build. | `status` `template_name` `template_version` `workspace_transition` | +| `coderd_proxyhealth_health_check_duration_seconds` | histogram | Histogram for duration of proxy health collection in seconds. | | +| `coderd_proxyhealth_health_check_results` | gauge | This endpoint returns a number to indicate the health status. -3 (unknown), -2 (Unreachable), -1 (Unhealthy), 0 (Unregistered), 1 (Healthy) | `proxy_id` | +| `coderd_template_workspace_build_duration_seconds` | histogram | Duration from workspace build creation to agent ready, by template. | `is_prebuild` `organization_name` `status` `template_name` `transition` | +| `coderd_workspace_builds_enqueued_total` | counter | Total number of workspace build enqueue attempts. | `build_reason` `provisioner_type` `status` `transition` | +| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `status` `template_name` `template_version` `workspace_name` `workspace_owner` `workspace_transition` | +| `coderd_workspace_creation_duration_seconds` | histogram | Time to create a workspace by organization, template, preset, and type (regular or prebuild). 
| `organization_name` `preset_name` `template_name` `type` | +| `coderd_workspace_creation_total` | counter | Total regular (non-prebuilt) workspace creations by organization, template, and preset. | `organization_name` `preset_name` `template_name` | +| `coderd_workspace_latest_build_status` | gauge | The current workspace statuses by template, transition, and owner for all non-deleted workspaces. | `status` `template_name` `template_version` `workspace_owner` `workspace_transition` | +| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | +| `go_goroutines` | gauge | Number of goroutines that currently exist. | | +| `go_info` | gauge | Information about the Go environment. | `version` | +| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | +| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | +| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | +| `go_memstats_frees_total` | counter | Total number of frees. | | +| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | +| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | +| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | +| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | +| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | +| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | +| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | +| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | +| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | +| `go_memstats_mallocs_total` | counter | Total number of mallocs. 
| | +| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | +| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | +| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | +| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | +| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | +| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | +| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | +| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | +| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | +| `go_threads` | gauge | Number of OS threads created. | | +| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | +| `process_max_fds` | gauge | Maximum number of open file descriptors. | | +| `process_open_fds` | gauge | Number of open file descriptors. | | +| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | +| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | +| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | +| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | +| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | +| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. 
| `code` | diff --git a/scripts/metricsdocgen/generated_metrics b/scripts/metricsdocgen/generated_metrics index 9eae5000ef..ce024a0a66 100644 --- a/scripts/metricsdocgen/generated_metrics +++ b/scripts/metricsdocgen/generated_metrics @@ -1,3 +1,12 @@ +# HELP agent_boundary_log_proxy_batches_dropped_total Total number of boundary log batches dropped before reaching coderd. Reason: buffer_full = the agent's internal buffer is full, meaning boundary is producing logs faster than the agent can forward them to coderd; forward_failed = the agent failed to send the batch to coderd, potentially because coderd is unreachable or the connection was interrupted. +# TYPE agent_boundary_log_proxy_batches_dropped_total counter +agent_boundary_log_proxy_batches_dropped_total{reason=""} 0 +# HELP agent_boundary_log_proxy_batches_forwarded_total Total number of boundary log batches successfully forwarded to coderd. Compare with batches_dropped_total to compute a drop rate. +# TYPE agent_boundary_log_proxy_batches_forwarded_total counter +agent_boundary_log_proxy_batches_forwarded_total 0 +# HELP agent_boundary_log_proxy_logs_dropped_total Total number of individual boundary log entries dropped before reaching coderd. Reason: buffer_full = the agent's internal buffer is full; forward_failed = the agent failed to send the batch to coderd; boundary_channel_full = boundary's internal send channel overflowed, meaning boundary is generating logs faster than it can batch and send them; boundary_batch_full = boundary's outgoing batch buffer overflowed after a failed flush, meaning boundary could not write to the agent's socket. +# TYPE agent_boundary_log_proxy_logs_dropped_total counter +agent_boundary_log_proxy_logs_dropped_total{reason=""} 0 # HELP coder_pubsub_connected Whether we are connected (1) or not connected (0) to postgres # TYPE coder_pubsub_connected gauge coder_pubsub_connected 0