mirror of
https://github.com/coder/coder.git
synced 2026-06-06 06:28:20 +00:00
3194bcfc9e
Provisioner steps broken into smaller granular actions. Changes: - `ExtractArchive` moved to `init` request (was in `configure`) - Writing `tfstate` moved to `plan` (was in `configure`) - Moved most plan/apply outputs to `GraphComplete`
284 lines
10 KiB
Go
284 lines
10 KiB
Go
package terraform
|
|
|
|
import (
|
|
"fmt"
|
|
"slices"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/cespare/xxhash/v2"
|
|
"google.golang.org/protobuf/types/known/timestamppb"
|
|
|
|
"github.com/coder/coder/v2/coderd/database"
|
|
"github.com/coder/coder/v2/coderd/database/dbtime"
|
|
"github.com/coder/coder/v2/provisionersdk/proto"
|
|
)
|
|
|
|
type timingKind string
|
|
|
|
// Copied from https://github.com/hashicorp/terraform/blob/01c0480e77263933b2b086dc8d600a69f80fad2d/internal/command/jsonformat/renderer.go
|
|
// We cannot reference these because they're in an internal package.
|
|
const (
|
|
// Stage markers are used to denote the beginning and end of stages. Without
|
|
// these, only discrete events (i.e. resource changes) within stages can be
|
|
// measured, which may omit setup/teardown time or other unmeasured overhead.
|
|
timingStageStart timingKind = "stage_start"
|
|
timingStageEnd timingKind = "stage_end"
|
|
timingStageError timingKind = "stage_error"
|
|
|
|
timingApplyStart timingKind = "apply_start"
|
|
timingApplyProgress timingKind = "apply_progress"
|
|
timingApplyComplete timingKind = "apply_complete"
|
|
timingApplyErrored timingKind = "apply_errored"
|
|
timingProvisionStart timingKind = "provision_start"
|
|
timingProvisionProgress timingKind = "provision_progress"
|
|
timingProvisionComplete timingKind = "provision_complete"
|
|
timingProvisionErrored timingKind = "provision_errored"
|
|
timingRefreshStart timingKind = "refresh_start"
|
|
timingRefreshComplete timingKind = "refresh_complete"
|
|
// Ignored.
|
|
timingChangeSummary timingKind = "change_summary"
|
|
timingDiagnostic timingKind = "diagnostic"
|
|
timingPlannedChange timingKind = "planned_change"
|
|
timingOutputs timingKind = "outputs"
|
|
timingResourceDrift timingKind = "resource_drift"
|
|
timingVersion timingKind = "version"
|
|
// These are not part of message_types, but we want to track init/graph timings as well.
|
|
timingGraphStart timingKind = "graph_start"
|
|
timingGraphComplete timingKind = "graph_complete"
|
|
timingGraphErrored timingKind = "graph_errored"
|
|
// Other terraform log types which we ignore.
|
|
timingLog timingKind = "log"
|
|
timingInitOutput timingKind = "init_output"
|
|
)
|
|
|
|
// Source: https://github.com/hashicorp/terraform/blob/6b73f710f8152ef4808e4de5bdfb35314442f4a5/internal/command/views/init.go#L267-L321
|
|
type initMessageCode string
|
|
|
|
const (
|
|
initCopyingConfigurationMessage initMessageCode = "copying_configuration_message"
|
|
initEmptyMessage initMessageCode = "empty_message"
|
|
initOutputInitEmptyMessage initMessageCode = "output_init_empty_message"
|
|
initOutputInitSuccessMessage initMessageCode = "output_init_success_message"
|
|
initOutputInitSuccessCloudMessage initMessageCode = "output_init_success_cloud_message"
|
|
initOutputInitSuccessCLIMessage initMessageCode = "output_init_success_cli_message"
|
|
initOutputInitSuccessCLICloudMessage initMessageCode = "output_init_success_cli_cloud_message"
|
|
initUpgradingModulesMessage initMessageCode = "upgrading_modules_message"
|
|
initInitializingTerraformCloudMessage initMessageCode = "initializing_terraform_cloud_message"
|
|
initInitializingModulesMessage initMessageCode = "initializing_modules_message"
|
|
initInitializingBackendMessage initMessageCode = "initializing_backend_message"
|
|
initInitializingStateStoreMessage initMessageCode = "initializing_state_store_message"
|
|
initDefaultWorkspaceCreatedMessage initMessageCode = "default_workspace_created_message"
|
|
initInitializingProviderPluginMessage initMessageCode = "initializing_provider_plugin_message"
|
|
initLockInfo initMessageCode = "lock_info"
|
|
initDependenciesLockChangesInfo initMessageCode = "dependencies_lock_changes_info"
|
|
)
|
|
|
|
type timingAggregator struct {
|
|
stage database.ProvisionerJobTimingStage
|
|
|
|
// Protects the stateLookup map.
|
|
lookupMu sync.Mutex
|
|
stateLookup map[uint64]*timingSpan
|
|
}
|
|
|
|
type timingSpan struct {
|
|
kind timingKind
|
|
// messageCode is only present in `terraform init` timings.
|
|
messageCode initMessageCode
|
|
start, end time.Time
|
|
stage database.ProvisionerJobTimingStage
|
|
action, provider, resource string
|
|
state proto.TimingState
|
|
}
|
|
|
|
// newTimingAggregator creates a new aggregator which measures the duration of resource init/plan/apply actions; stage
|
|
// represents the stage of provisioning the timings are occurring within.
|
|
func newTimingAggregator(stage database.ProvisionerJobTimingStage) *timingAggregator {
|
|
return &timingAggregator{
|
|
stage: stage,
|
|
stateLookup: make(map[uint64]*timingSpan),
|
|
}
|
|
}
|
|
|
|
// ingest accepts a timing span at a certain timestamp and assigns it a state according to the kind of timing event.
|
|
// We memoize start & completion events, and then calculate their total duration in aggregate.
|
|
// We ignore progress events because we only care about the full duration of the action (delta between *_start and *_complete events).
|
|
func (t *timingAggregator) ingest(ts time.Time, s *timingSpan) {
|
|
if s == nil {
|
|
return
|
|
}
|
|
|
|
s.stage = t.stage
|
|
ts = dbtime.Time(ts.UTC())
|
|
|
|
switch s.kind {
|
|
case timingApplyStart, timingProvisionStart, timingRefreshStart, timingGraphStart, timingStageStart:
|
|
s.start = ts
|
|
s.state = proto.TimingState_STARTED
|
|
case timingApplyComplete, timingProvisionComplete, timingRefreshComplete, timingGraphComplete, timingStageEnd:
|
|
s.end = ts
|
|
s.state = proto.TimingState_COMPLETED
|
|
case timingApplyErrored, timingProvisionErrored, timingGraphErrored, timingStageError:
|
|
s.end = ts
|
|
s.state = proto.TimingState_FAILED
|
|
case timingInitOutput:
|
|
// init timings are based on the init message code.
|
|
t.ingestInitTiming(ts, s)
|
|
return
|
|
default:
|
|
// We just want start/end timings, ignore all other events.
|
|
return
|
|
}
|
|
|
|
t.lookupMu.Lock()
|
|
// Memoize this span by its unique attributes and the determined state.
|
|
// This will be used in aggregate() to determine the duration of the resource action.
|
|
t.stateLookup[s.hashByState(s.state)] = s
|
|
t.lookupMu.Unlock()
|
|
}
|
|
|
|
// aggregate performs a pass through all memoized events to build up a slice of *proto.Timing instances which represent
|
|
// the total time taken to perform a certain action.
|
|
// The resulting slice of *proto.Timing is NOT sorted.
|
|
func (t *timingAggregator) aggregate() []*proto.Timing {
|
|
t.lookupMu.Lock()
|
|
defer t.lookupMu.Unlock()
|
|
|
|
// Pre-allocate len(measurements)/2 since each timing will have one STARTED and one FAILED/COMPLETED entry.
|
|
out := make([]*proto.Timing, 0, len(t.stateLookup)/2)
|
|
|
|
for _, s := range t.stateLookup {
|
|
// We are only concerned here with failed or completed events.
|
|
if s.state != proto.TimingState_FAILED && s.state != proto.TimingState_COMPLETED {
|
|
continue
|
|
}
|
|
|
|
// Look for a corresponding span for the STARTED state.
|
|
startSpan, ok := t.stateLookup[s.hashByState(proto.TimingState_STARTED)]
|
|
if !ok {
|
|
// Not found, we'll ignore this span.
|
|
continue
|
|
}
|
|
s.start = startSpan.start
|
|
|
|
// Until faster-than-light travel is a possibility, let's prevent this.
|
|
// Better to capture a zero delta than a negative one.
|
|
if s.start.After(s.end) {
|
|
s.start = s.end
|
|
}
|
|
|
|
// Let's only aggregate valid entries.
|
|
// Later we can add support for partial / failed applies, perhaps.
|
|
if s.start.IsZero() || s.end.IsZero() {
|
|
continue
|
|
}
|
|
|
|
out = append(out, s.toProto())
|
|
}
|
|
|
|
return out
|
|
}
|
|
|
|
// startStage denotes the beginning of a stage and returns a function which
|
|
// should be called to mark the end of the stage. This is used to measure a
|
|
// stage's total duration across all it's discrete events and unmeasured
|
|
// overhead/events.
|
|
func (t *timingAggregator) startStage(stage database.ProvisionerJobTimingStage) (end func(err error)) {
|
|
ts := timingSpan{
|
|
kind: timingStageStart,
|
|
stage: stage,
|
|
resource: "coder_stage_" + string(stage),
|
|
action: "terraform",
|
|
provider: "coder",
|
|
}
|
|
endTs := ts
|
|
t.ingest(dbtime.Now(), &ts)
|
|
|
|
return func(err error) {
|
|
endTs.kind = timingStageEnd
|
|
if err != nil {
|
|
endTs.kind = timingStageError
|
|
}
|
|
t.ingest(dbtime.Now(), &endTs)
|
|
}
|
|
}
|
|
|
|
func (l timingKind) Valid() bool {
|
|
return slices.Contains([]timingKind{
|
|
timingStageStart,
|
|
timingStageEnd,
|
|
timingStageError,
|
|
timingApplyStart,
|
|
timingApplyProgress,
|
|
timingApplyComplete,
|
|
timingApplyErrored,
|
|
timingProvisionStart,
|
|
timingProvisionProgress,
|
|
timingProvisionComplete,
|
|
timingProvisionErrored,
|
|
timingRefreshStart,
|
|
timingRefreshComplete,
|
|
timingChangeSummary,
|
|
timingDiagnostic,
|
|
timingPlannedChange,
|
|
timingOutputs,
|
|
timingResourceDrift,
|
|
timingVersion,
|
|
timingGraphStart,
|
|
timingGraphComplete,
|
|
timingGraphErrored,
|
|
timingLog,
|
|
timingInitOutput,
|
|
}, l)
|
|
}
|
|
|
|
// Category returns the category for a giving timing state so that timings can be aggregated by this category.
|
|
// We can't use the state itself because we need an `apply_start` and an `apply_complete` to both hash to the same entry
|
|
// if all other attributes are identical.
|
|
func (l timingKind) Category() string {
|
|
switch l {
|
|
case timingStageStart, timingStageEnd, timingStageError:
|
|
return "stage"
|
|
case timingInitOutput:
|
|
return "init"
|
|
case timingGraphStart, timingGraphComplete, timingGraphErrored:
|
|
return "graph"
|
|
case timingApplyStart, timingApplyProgress, timingApplyComplete, timingApplyErrored:
|
|
return "apply"
|
|
case timingProvisionStart, timingProvisionProgress, timingProvisionComplete, timingProvisionErrored:
|
|
return "provision"
|
|
case timingRefreshStart, timingRefreshComplete:
|
|
return "state refresh"
|
|
default:
|
|
return "?"
|
|
}
|
|
}
|
|
|
|
// hashState computes a hash based on a timingSpan's unique properties and state.
|
|
// The combination of resource and provider names MUST be unique across entries.
|
|
func (e *timingSpan) hashByState(state proto.TimingState) uint64 {
|
|
id := fmt.Sprintf("%s:%s:%s:%s:%s", e.kind.Category(), state.String(), e.action, e.resource, e.provider)
|
|
if e.messageCode != "" {
|
|
id += ":" + string(e.messageCode)
|
|
}
|
|
return xxhash.Sum64String(id)
|
|
}
|
|
|
|
func (e *timingSpan) toProto() *proto.Timing {
|
|
// Some log entries, like state refreshes, don't have any "action" logged.
|
|
if e.action == "" {
|
|
e.action = e.kind.Category()
|
|
}
|
|
|
|
return &proto.Timing{
|
|
Start: timestamppb.New(e.start),
|
|
End: timestamppb.New(e.end),
|
|
Action: e.action,
|
|
Stage: string(e.stage),
|
|
Source: e.provider,
|
|
Resource: e.resource,
|
|
State: e.state,
|
|
}
|
|
}
|