diff --git a/acceptance/bundle/telemetry/deploy/out.telemetry.txt b/acceptance/bundle/telemetry/deploy/out.telemetry.txt index f945233dd16..3e867059b70 100644 --- a/acceptance/bundle/telemetry/deploy/out.telemetry.txt +++ b/acceptance/bundle/telemetry/deploy/out.telemetry.txt @@ -35,6 +35,26 @@ "[UUID]", "[UUID]" ], + "resources_metadata": { + "state_engine": "[ENGINE]", + "state_file_size_bytes": SMALL_INT, + "resources": [ + { + "resource_type": "jobs", + "count": 3, + "state_size_max_bytes": SMALL_INT, + "state_size_mean_bytes": SMALL_INT, + "state_size_median_bytes": SMALL_INT + }, + { + "resource_type": "pipelines", + "count": 2, + "state_size_max_bytes": SMALL_INT, + "state_size_mean_bytes": SMALL_INT, + "state_size_median_bytes": SMALL_INT + } + ] + }, "experimental": { "configuration_file_count": 1, "variable_count": 0, diff --git a/acceptance/bundle/telemetry/test.toml b/acceptance/bundle/telemetry/test.toml index 14453c07f92..deda1fb2aad 100644 --- a/acceptance/bundle/telemetry/test.toml +++ b/acceptance/bundle/telemetry/test.toml @@ -20,3 +20,28 @@ New = '[OS]' [[Repls]] Old = '"local_cache_measurements_ms": \[[^\]]*\]' New = '"local_cache_measurements_ms": [...redacted...]' + +# Normalize the deployment engine name in resources_metadata so the same +# fixture covers both DATABRICKS_BUNDLE_ENGINE=direct and =terraform runs. +[[Repls]] +Old = '"state_engine": "(direct|terraform)"' +New = '"state_engine": "[ENGINE]"' + +# Normalize byte-size measurements in resources_metadata to placeholders. +# Exact byte counts depend on resource state JSON formatting and would +# make these golden files brittle across SDK changes. +[[Repls]] +Old = '"state_file_size_bytes": \d+' +New = '"state_file_size_bytes": SMALL_INT' + +[[Repls]] +Old = '"state_size_max_bytes": \d+' +New = '"state_size_max_bytes": SMALL_INT' + +[[Repls]] +Old = '"state_size_mean_bytes": \d+' +New = '"state_size_mean_bytes": SMALL_INT' + +[[Repls]] +Old = '"state_size_median_bytes": \d+' +New = '"state_size_median_bytes": SMALL_INT' diff --git a/bundle/phases/resources_metadata.go b/bundle/phases/resources_metadata.go new file mode 100644 index 00000000000..1b9633a353a --- /dev/null +++ b/bundle/phases/resources_metadata.go @@ -0,0 +1,221 @@ +package phases + +import ( + "cmp" + "context" + "encoding/json" + "fmt" + "slices" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/config/engine" + "github.com/databricks/cli/bundle/direct/dresources" + "github.com/databricks/cli/bundle/direct/dstate" + "github.com/databricks/cli/libs/dyn" + "github.com/databricks/cli/libs/log" + "github.com/databricks/cli/libs/telemetry/protos" +) + +// collectResourcesMetadata builds a BundleResourcesMetadata for the deploy. +// +// State sizes are computed by running each resource's typed config through +// the direct engine's adapter.PrepareState — the same transformation direct +// uses to derive the value it persists to resources.json — and marshaling +// each entry with dstate.SaveState's encoding (MarshalIndent(" ", " ")). +// The whole-file size is then computed by assembling those entries into a +// dstate.Database and marshaling it the way DeploymentState.unlockedSave +// writes it (MarshalIndent("", " ")). So: +// +// - Under DATABRICKS_BUNDLE_ENGINE=direct, per-resource sizes equal +// len(entry.State) on disk byte-for-byte, and state_file_size_bytes +// matches the resources.json file size to within a few bytes (only +// Lineage and Serial may differ, which we set to "" / 0 here). +// - Under =terraform, the same computation runs against the bundle config, +// producing identical numbers for the same logical bundle. tfstate is +// never read. +// +// Returns nil when the bundle declares no resources. +func collectResourcesMetadata(ctx context.Context, b *bundle.Bundle) *protos.BundleResourcesMetadata { + counts, sizesByType, fileSize := collectResourceCountsAndSizes(ctx, b) + if len(counts) == 0 { + return nil + } + + types := unionKeys(counts, sizesByType) + slices.Sort(types) + + resources := make([]protos.ResourceMetadata, 0, len(types)) + for _, t := range types { + sizes := sizesByType[t] + slices.SortFunc(sizes, func(a, b int64) int { return cmp.Compare(a, b) }) + resources = append(resources, protos.ResourceMetadata{ + ResourceType: t, + Count: counts[t], + StateSizeMaxBytes: statMax(sizes), + StateSizeMeanBytes: statMean(sizes), + StateSizeMedianBytes: statMedian(sizes), + }) + } + + return &protos.BundleResourcesMetadata{ + StateEngine: resolveDeployEngine(ctx, b), + StateFileSizeBytes: fileSize, + Resources: resources, + } +} + +// collectResourceCountsAndSizes walks the bundle config and assembles a +// dstate.Database with each resource's PrepareState'd value, then marshals +// that database the way direct writes resources.json. Returns per-type +// counts, per-type per-resource byte lengths, and the byte length of the +// whole simulated state file. +func collectResourceCountsAndSizes(ctx context.Context, b *bundle.Bundle) (map[string]int64, map[string][]int64, int64) { + counts := make(map[string]int64) + sizesByType := make(map[string][]int64) + + adapters := getAdapters(ctx, b) + db := dstate.NewDatabase("", 0) + + pattern := dyn.NewPattern(dyn.Key("resources"), dyn.AnyKey(), dyn.AnyKey()) + _, err := dyn.MapByPattern(b.Config.Value(), pattern, func(p dyn.Path, v dyn.Value) (dyn.Value, error) { + if len(p) < 3 { + return v, nil + } + resourceType := p[1].Key() + counts[resourceType]++ + + stateBytes, err := resourceStateBytes(b, adapters, p, resourceType) + if err != nil { + log.Debugf(ctx, "resources-metadata telemetry: %s: %s", p, err) + return v, nil + } + sizesByType[resourceType] = append(sizesByType[resourceType], int64(len(stateBytes))) + db.State[p.String()] = dstate.ResourceEntry{ + ID: extractResourceID(v), + State: stateBytes, + } + return v, nil + }) + if err != nil { + log.Debugf(ctx, "resources-metadata telemetry: failed to walk config resources: %s", err) + } + + var fileSize int64 + if len(db.State) > 0 { + raw, mErr := json.MarshalIndent(db, "", " ") + if mErr != nil { + log.Debugf(ctx, "resources-metadata telemetry: failed to marshal database envelope: %s", mErr) + } else { + fileSize = int64(len(raw)) + } + } + return counts, sizesByType, fileSize +} + +// resourceStateBytes derives the bytes direct would store for one resource: +// GetResourceConfig (typed) → adapter.PrepareState → MarshalIndent with the +// same prefix/indent direct uses in dstate.SaveState. Falls back to marshaling +// the typed config when no adapter is registered for the resource type +// (e.g., a type the direct engine doesn't yet support). +func resourceStateBytes(b *bundle.Bundle, adapters map[string]*dresources.Adapter, p dyn.Path, resourceType string) ([]byte, error) { + cfg, err := b.Config.GetResourceConfig(p.String()) + if err != nil { + return nil, fmt.Errorf("get config: %w", err) + } + + target := cfg + if adapter, ok := adapters[resourceType]; ok { + state, err := adapter.PrepareState(cfg) + if err != nil { + return nil, fmt.Errorf("prepare state: %w", err) + } + target = state + } + + // dstate.SaveState writes resource state with MarshalIndent using these + // exact prefix/indent arguments; matching them here means each resource's + // byte length equals len(entry.State) on disk for direct deploys. + raw, err := json.MarshalIndent(target, " ", " ") + if err != nil { + return nil, fmt.Errorf("marshal: %w", err) + } + return raw, nil +} + +// extractResourceID returns the resource's ID string from its dyn.Value entry, +// or "" if not yet set. Each resources.. entry has an "id" field +// populated post-deploy (via BaseResource.ID). +func extractResourceID(v dyn.Value) string { + idVal, err := dyn.Get(v, "id") + if err != nil || idVal.Kind() != dyn.KindString { + return "" + } + return idVal.MustString() +} + +// getAdapters returns adapters initialized for PrepareState. If the bundle +// already has them initialized (direct engine path), reuse them. Otherwise, +// build a fresh set with a nil workspace client — PrepareState is a pure +// transformation that doesn't touch the client. +func getAdapters(ctx context.Context, b *bundle.Bundle) map[string]*dresources.Adapter { + if b.DeploymentBundle.Adapters != nil { + return b.DeploymentBundle.Adapters + } + adapters, err := dresources.InitAll(nil) + if err != nil { + log.Debugf(ctx, "resources-metadata telemetry: failed to init adapters: %s", err) + return nil + } + return adapters +} + +// resolveDeployEngine returns the effective deploy engine ("direct" or +// "terraform"). Mirrors cmd/bundle/utils.ResolveEngineSetting but is inlined +// here to avoid a layering import (bundle/phases must not depend on cmd/). +func resolveDeployEngine(ctx context.Context, b *bundle.Bundle) string { + if b.Config.Bundle.Engine != engine.EngineNotSet { + return string(b.Config.Bundle.Engine.ThisOrDefault()) + } + envEngine, _ := engine.FromEnv(ctx) + return string(envEngine.ThisOrDefault()) +} + +func unionKeys(a map[string]int64, b map[string][]int64) []string { + seen := make(map[string]struct{}, len(a)+len(b)) + for k := range a { + seen[k] = struct{}{} + } + for k := range b { + seen[k] = struct{}{} + } + out := make([]string, 0, len(seen)) + for k := range seen { + out = append(out, k) + } + return out +} + +func statMax(sortedSizes []int64) int64 { + if len(sortedSizes) == 0 { + return 0 + } + return sortedSizes[len(sortedSizes)-1] +} + +func statMean(sortedSizes []int64) int64 { + if len(sortedSizes) == 0 { + return 0 + } + var total int64 + for _, s := range sortedSizes { + total += s + } + return total / int64(len(sortedSizes)) +} + +func statMedian(sortedSizes []int64) int64 { + if len(sortedSizes) == 0 { + return 0 + } + return sortedSizes[(len(sortedSizes)-1)/2] +} diff --git a/bundle/phases/resources_metadata_test.go b/bundle/phases/resources_metadata_test.go new file mode 100644 index 00000000000..49c33156a34 --- /dev/null +++ b/bundle/phases/resources_metadata_test.go @@ -0,0 +1,140 @@ +package phases + +import ( + "testing" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/config/engine" + "github.com/databricks/cli/libs/dyn" + "github.com/databricks/cli/libs/dyn/convert" + "github.com/databricks/cli/libs/env" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func bundleWithResources(t *testing.T, resources map[string]map[string]any) *bundle.Bundle { + t.Helper() + b := &bundle.Bundle{} + err := b.Config.Mutate(func(v dyn.Value) (dyn.Value, error) { + typed := make(map[string]dyn.Value) + for resourceType, byName := range resources { + named := make(map[string]dyn.Value) + for name, r := range byName { + dv, err := convert.FromTyped(r, dyn.NilValue) + require.NoError(t, err) + named[name] = dv + } + typed[resourceType] = dyn.NewValue(named, nil) + } + return dyn.Set(v, "resources", dyn.NewValue(typed, nil)) + }) + require.NoError(t, err) + return b +} + +func TestCollectResourceCountsAndSizes_GroupsByType(t *testing.T) { + b := bundleWithResources(t, map[string]map[string]any{ + "jobs": { + "a": map[string]any{"name": "a"}, + "b": map[string]any{"name": "ab"}, + }, + "pipelines": { + "p": map[string]any{"name": "p1"}, + }, + }) + + counts, sizes, fileSize := collectResourceCountsAndSizes(t.Context(), b) + assert.Equal(t, map[string]int64{"jobs": 2, "pipelines": 1}, counts) + assert.Len(t, sizes["jobs"], 2) + assert.Len(t, sizes["pipelines"], 1) + // fileSize is the simulated dstate.Database byte length, which includes + // the envelope (state_version, cli_version, lineage, serial, the state + // map wrapper, and per-entry __id__) on top of each resource's bytes. + // So fileSize must exceed the per-resource sum. + var sum int64 + for _, list := range sizes { + for _, s := range list { + assert.Positive(t, s) + sum += s + } + } + assert.Greater(t, fileSize, sum, "fileSize should include envelope overhead beyond per-resource bytes") +} + +func TestCollectResourceCountsAndSizes_NoResources(t *testing.T) { + b := &bundle.Bundle{} + counts, sizes, total := collectResourceCountsAndSizes(t.Context(), b) + assert.Empty(t, counts) + assert.Empty(t, sizes) + assert.Equal(t, int64(0), total) +} + +func TestResolveDeployEngine(t *testing.T) { + cases := []struct { + name string + configEng engine.EngineType + envEng string + want string + }{ + {"config wins over env", engine.EngineDirect, "terraform", "direct"}, + {"env used when config unset", engine.EngineNotSet, "direct", "direct"}, + {"default when neither set", engine.EngineNotSet, "", "terraform"}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + b := &bundle.Bundle{} + b.Config.Bundle.Engine = c.configEng + ctx := env.Set(t.Context(), engine.EnvVar, c.envEng) + assert.Equal(t, c.want, resolveDeployEngine(ctx, b)) + }) + } +} + +func TestCollectResourcesMetadata_DeterministicAndComparable(t *testing.T) { + // Same bundle config under both engines must produce identical metadata. + resources := map[string]map[string]any{ + "jobs": { + "a": map[string]any{"name": "alpha"}, + "b": map[string]any{"name": "beta"}, + "c": map[string]any{"name": "gamma"}, + }, + "pipelines": { + "p": map[string]any{"name": "pipe"}, + }, + } + bDirect := bundleWithResources(t, resources) + bDirect.Config.Bundle.Engine = engine.EngineDirect + bTerraform := bundleWithResources(t, resources) + bTerraform.Config.Bundle.Engine = engine.EngineTerraform + + mdDirect := collectResourcesMetadata(t.Context(), bDirect) + mdTerraform := collectResourcesMetadata(t.Context(), bTerraform) + + require.NotNil(t, mdDirect) + require.NotNil(t, mdTerraform) + assert.Equal(t, "direct", mdDirect.StateEngine) + assert.Equal(t, "terraform", mdTerraform.StateEngine) + assert.Equal(t, mdDirect.StateFileSizeBytes, mdTerraform.StateFileSizeBytes) + assert.Equal(t, mdDirect.Resources, mdTerraform.Resources) +} + +func TestCollectResourcesMetadata_ReturnsNilWhenNoResources(t *testing.T) { + b := &bundle.Bundle{} + assert.Nil(t, collectResourcesMetadata(t.Context(), b)) +} + +func TestStatHelpers(t *testing.T) { + assert.Equal(t, int64(3), statMax([]int64{1, 2, 3})) + assert.Equal(t, int64(2), statMean([]int64{1, 2, 3})) + assert.Equal(t, int64(2), statMedian([]int64{1, 2, 3})) + // Lower-middle for even count: sorted [1,2,3,4] -> index (4-1)/2 = 1 -> 2. + assert.Equal(t, int64(2), statMedian([]int64{1, 2, 3, 4})) + assert.Equal(t, int64(0), statMax(nil)) + assert.Equal(t, int64(0), statMean(nil)) + assert.Equal(t, int64(0), statMedian(nil)) +} + +func TestUnionKeys(t *testing.T) { + got := unionKeys(map[string]int64{"a": 1, "b": 2}, map[string][]int64{"b": nil, "c": nil}) + assert.ElementsMatch(t, []string{"a", "b", "c"}, got) +} diff --git a/bundle/phases/telemetry.go b/bundle/phases/telemetry.go index 6496b45921d..620e3e17f57 100644 --- a/bundle/phases/telemetry.go +++ b/bundle/phases/telemetry.go @@ -195,6 +195,8 @@ func LogDeployTelemetry(ctx context.Context, b *bundle.Bundle, errMsg string) { ResourceClusterIDs: clusterIds, ResourceDashboardIDs: dashboardIds, + ResourcesMetadata: collectResourcesMetadata(ctx, b), + Experimental: &protos.BundleDeployExperimental{ BundleMode: mode, ConfigurationFileCount: b.Metrics.ConfigurationFileCount, diff --git a/libs/telemetry/protos/bundle_deploy.go b/libs/telemetry/protos/bundle_deploy.go index d9439437d9b..a52b899dfb7 100644 --- a/libs/telemetry/protos/bundle_deploy.go +++ b/libs/telemetry/protos/bundle_deploy.go @@ -10,6 +10,9 @@ type BundleDeployEvent struct { // Error message encountered during the bundle deploy command, if any. ErrorMessage string `json:"error_message,omitempty"` + // Deprecated: use ResourcesMetadata.Resources[*].Count instead (per + // resource type; sum across types for the total). The new field derives + // counts from the bundle configuration at telemetry-emission time. ResourceCount int64 `json:"resource_count"` ResourceJobCount int64 `json:"resource_job_count"` ResourcePipelineCount int64 `json:"resource_pipeline_count"` @@ -32,6 +35,9 @@ type BundleDeployEvent struct { ResourceClusterIDs []string `json:"resource_cluster_ids,omitempty"` ResourceDashboardIDs []string `json:"resource_dashboard_ids,omitempty"` + // Per-resource-type metadata (counts and state-size statistics). + ResourcesMetadata *BundleResourcesMetadata `json:"resources_metadata,omitempty"` + Experimental *BundleDeployExperimental `json:"experimental,omitempty"` } @@ -88,6 +94,52 @@ type BundleDeployExperimental struct { LocalCacheMeasurementsMs []IntMapEntry `json:"local_cache_measurements_ms,omitempty"` } +// BundleResourcesMetadata mirrors the universe proto. Per-resource-type +// metadata for one bundle deployment, including counts (which replace the +// deprecated DatabricksBundleDeployEvent.resource_*_count fields) and +// state-size statistics. +// +// State sizes are computed by running each resource through the direct +// engine's adapter.PrepareState (the same transformation direct uses to +// derive the value it persists to resources.json) and marshaling with the +// same indented encoding used by dstate.SaveState. So: +// - Under DATABRICKS_BUNDLE_ENGINE=direct, each per-resource size equals +// len(entry.State) on disk byte-for-byte. +// - Under =terraform, the same computation runs against the bundle config, +// producing identical numbers for the same logical bundle. Tfstate is +// never read here. +type BundleResourcesMetadata struct { + // Effective deploy engine: "direct" or "terraform". + StateEngine string `json:"state_engine,omitempty"` + + // Size in bytes of the simulated deployment state file: each resource's + // prepared state assembled into a dstate.Database and marshaled the way + // direct writes resources.json (envelope included, so this exceeds the + // per-resource sum). For direct deploys it differs from the on-disk + // resources.json only by the Lineage and Serial fields, which are left + // empty/0 here. + StateFileSizeBytes int64 `json:"state_file_size_bytes,omitempty"` + + // One entry per resource type present in the bundle. + Resources []ResourceMetadata `json:"resources,omitempty"` +} + +// ResourceMetadata holds metadata about resources of a single type within one +// bundle deployment. +type ResourceMetadata struct { + // Resource type name: "jobs", "pipelines", "schemas", ... + ResourceType string `json:"resource_type,omitempty"` + + // Number of resources of this type declared in the bundle configuration. + Count int64 `json:"count,omitempty"` + + // Per-resource state-size statistics, computed via the direct engine's + // PrepareState transformation. Zero when no resources of this type exist. + StateSizeMaxBytes int64 `json:"state_size_max_bytes,omitempty"` + StateSizeMeanBytes int64 `json:"state_size_mean_bytes,omitempty"` + StateSizeMedianBytes int64 `json:"state_size_median_bytes,omitempty"` +} + type BoolMapEntry struct { Key string `json:"key,omitempty"` Value bool `json:"value"`