Skip to content

Commit 1f3aaaa

Browse files
authored
Implement agent_sandbox_claim_creation_total counter metric. (kubernetes-sigs#372)
1 parent 719301a commit 1f3aaaa

4 files changed

Lines changed: 220 additions & 7 deletions

File tree

extensions/controllers/sandboxclaim_controller.go

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ import (
4646
// TODO: These constants should be imported from the main controller package Issue #216
4747
const (
4848
// sandboxLabel is a label key under the agents.x-k8s.io prefix.
// NOTE(review): presumably its value is a hash of the owning Sandbox's name — confirm against the main controller package.
sandboxLabel = "agents.x-k8s.io/sandbox-name-hash"
49+
// poolNameNone is the pool name tryAdoptPodFromPool returns when it has no
// warm pool to report: listing failed, no candidate pod was available, the
// adopted pod had no controller owner reference, or the pod update failed.
// createSandbox passes the returned value on as the warm pool name when it
// records the claim creation metric.
poolNameNone = "none"
4950
)
5051

5152
// ErrTemplateNotFound is a sentinel error indicating a SandboxTemplate was not found.
@@ -340,7 +341,7 @@ func (r *SandboxClaimReconciler) computeAndSetStatus(claim *extensionsv1alpha1.S
340341
}
341342

342343
// tryAdoptPodFromPool attempts to find and adopt a pod from the warm pool
343-
func (r *SandboxClaimReconciler) tryAdoptPodFromPool(ctx context.Context, claim *extensionsv1alpha1.SandboxClaim, sandbox *v1alpha1.Sandbox) (*corev1.Pod, error) {
344+
func (r *SandboxClaimReconciler) tryAdoptPodFromPool(ctx context.Context, claim *extensionsv1alpha1.SandboxClaim, sandbox *v1alpha1.Sandbox) (*corev1.Pod, string, error) {
344345
log := log.FromContext(ctx)
345346

346347
// List all pods with the podTemplateHashLabel matching the hash
@@ -354,7 +355,7 @@ func (r *SandboxClaimReconciler) tryAdoptPodFromPool(ctx context.Context, claim
354355
Namespace: claim.Namespace,
355356
}); err != nil {
356357
log.Error(err, "Failed to list pods from warm pool")
357-
return nil, err
358+
return nil, poolNameNone, err
358359
}
359360

360361
// Filter pods and create a slice of pointers for sorting
@@ -381,15 +382,20 @@ func (r *SandboxClaimReconciler) tryAdoptPodFromPool(ctx context.Context, claim
381382

382383
if len(candidates) == 0 {
383384
log.Info("No available pods in warm pool (all pods are being deleted, owned by other controllers, or pool is empty)")
384-
return nil, nil
385+
return nil, poolNameNone, nil
385386
}
386387

387388
// Sort pods using podutils.ByLogging to select the best available pod.
388389
sort.Sort(podutils.ByLogging(candidates))
389390

390391
// Get the first available pod
391392
pod := candidates[0]
392-
log.Info("Adopting pod from warm pool", "pod", pod.Name)
393+
poolName := poolNameNone
394+
if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil {
395+
poolName = controllerRef.Name
396+
}
397+
398+
log.Info("Adopting pod from warm pool", "pod", pod.Name, "pool", poolName)
393399

394400
// Remove the pool labels
395401
delete(pod.Labels, poolLabel)
@@ -412,11 +418,11 @@ func (r *SandboxClaimReconciler) tryAdoptPodFromPool(ctx context.Context, claim
412418
// Update the pod
413419
if err := r.Update(ctx, pod); err != nil {
414420
log.Error(err, "Failed to update adopted pod")
415-
return nil, err
421+
return nil, poolNameNone, err
416422
}
417423

418424
log.Info("Successfully adopted pod from warm pool", "pod", pod.Name, "sandbox", sandbox.Name)
419-
return pod, nil
425+
return pod, poolName, nil
420426
}
421427

422428
func (r *SandboxClaimReconciler) createSandbox(ctx context.Context, claim *extensionsv1alpha1.SandboxClaim, template *extensionsv1alpha1.SandboxTemplate) (*v1alpha1.Sandbox, error) {
@@ -466,7 +472,7 @@ func (r *SandboxClaimReconciler) createSandbox(ctx context.Context, claim *exten
466472
}
467473

468474
// Before creating the sandbox, try to adopt a pod from the warm pool
469-
adoptedPod, adoptErr := r.tryAdoptPodFromPool(ctx, claim, sandbox)
475+
adoptedPod, poolName, adoptErr := r.tryAdoptPodFromPool(ctx, claim, sandbox)
470476
if adoptErr != nil {
471477
logger.Error(adoptErr, "Failed to adopt pod from warm pool")
472478
return nil, adoptErr
@@ -492,6 +498,28 @@ func (r *SandboxClaimReconciler) createSandbox(ctx context.Context, claim *exten
492498
r.Recorder.Event(claim, corev1.EventTypeNormal, "SandboxProvisioned", fmt.Sprintf("Created Sandbox %q", sandbox.Name))
493499
}
494500

501+
launchType := asmetrics.LaunchTypeCold
502+
podCondition := "not_ready"
503+
if adoptedPod != nil {
504+
launchType = asmetrics.LaunchTypeWarm
505+
506+
// Fetch the latest pod status to ensure accuracy
507+
latestPod := &corev1.Pod{}
508+
if err := r.Get(ctx, client.ObjectKeyFromObject(adoptedPod), latestPod); err == nil {
509+
adoptedPod = latestPod
510+
} else {
511+
logger.Error(err, "Failed to fetch latest pod status for metric recording", "pod", adoptedPod.Name)
512+
}
513+
514+
for _, cond := range adoptedPod.Status.Conditions {
515+
if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue {
516+
podCondition = "ready"
517+
break
518+
}
519+
}
520+
}
521+
asmetrics.RecordSandboxClaimCreation(claim.Namespace, claim.Spec.TemplateRef.Name, launchType, poolName, podCondition)
522+
495523
return sandbox, nil
496524
}
497525

extensions/controllers/sandboxclaim_controller_test.go

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -969,6 +969,146 @@ func TestRecordCreationLatencyMetric(t *testing.T) {
969969
}
970970
}
971971

972+
func TestSandboxClaimCreationMetric(t *testing.T) {
973+
template := &extensionsv1alpha1.SandboxTemplate{
974+
ObjectMeta: metav1.ObjectMeta{Name: "test-template", Namespace: "default"},
975+
Spec: extensionsv1alpha1.SandboxTemplateSpec{
976+
PodTemplate: sandboxv1alpha1.PodTemplate{
977+
Spec: corev1.PodSpec{
978+
Containers: []corev1.Container{{Name: "test-container", Image: "test-image"}},
979+
},
980+
},
981+
},
982+
}
983+
984+
claim := &extensionsv1alpha1.SandboxClaim{
985+
ObjectMeta: metav1.ObjectMeta{Name: "test-claim", Namespace: "default", UID: "claim-uid"},
986+
Spec: extensionsv1alpha1.SandboxClaimSpec{TemplateRef: extensionsv1alpha1.SandboxTemplateRef{Name: "test-template"}},
987+
}
988+
989+
t.Run("Cold Start", func(t *testing.T) {
990+
asmetrics.SandboxClaimCreationTotal.Reset()
991+
scheme := newScheme(t)
992+
client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(template, claim).WithStatusSubresource(claim).Build()
993+
reconciler := &SandboxClaimReconciler{
994+
Client: client,
995+
Scheme: scheme,
996+
Recorder: record.NewFakeRecorder(10),
997+
Tracer: asmetrics.NewNoOp(),
998+
}
999+
1000+
req := reconcile.Request{NamespacedName: types.NamespacedName{Name: claim.Name, Namespace: "default"}}
1001+
_, err := reconciler.Reconcile(context.Background(), req)
1002+
if err != nil {
1003+
t.Fatalf("reconcile failed: %v", err)
1004+
}
1005+
1006+
// Verify metric
1007+
val := testutil.ToFloat64(asmetrics.SandboxClaimCreationTotal.WithLabelValues("default", "test-template", asmetrics.LaunchTypeCold, "none", "not_ready"))
1008+
if val != 1 {
1009+
t.Errorf("expected metric count 1, got %v", val)
1010+
}
1011+
})
1012+
1013+
t.Run("Warm Start", func(t *testing.T) {
1014+
asmetrics.SandboxClaimCreationTotal.Reset()
1015+
1016+
// Create a warm pool pod
1017+
poolNameHash := sandboxcontrollers.NameHash("test-pool")
1018+
warmPod := &corev1.Pod{
1019+
ObjectMeta: metav1.ObjectMeta{
1020+
Name: "warm-pod",
1021+
Namespace: "default",
1022+
Labels: map[string]string{
1023+
poolLabel: poolNameHash,
1024+
sandboxTemplateRefHash: sandboxcontrollers.NameHash("test-template"),
1025+
},
1026+
OwnerReferences: []metav1.OwnerReference{
1027+
{
1028+
APIVersion: "extensions.agents.x-k8s.io/v1alpha1",
1029+
Kind: "SandboxWarmPool",
1030+
Name: "test-pool",
1031+
UID: "pool-uid",
1032+
Controller: ptr.To(true),
1033+
},
1034+
},
1035+
},
1036+
Status: corev1.PodStatus{Conditions: []corev1.PodCondition{{Type: corev1.PodReady, Status: corev1.ConditionTrue}}},
1037+
Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "c", Image: "i"}}},
1038+
}
1039+
1040+
scheme := newScheme(t)
1041+
client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(template, claim, warmPod).WithStatusSubresource(claim).Build()
1042+
reconciler := &SandboxClaimReconciler{
1043+
Client: client,
1044+
Scheme: scheme,
1045+
Recorder: record.NewFakeRecorder(10),
1046+
Tracer: asmetrics.NewNoOp(),
1047+
}
1048+
1049+
req := reconcile.Request{NamespacedName: types.NamespacedName{Name: claim.Name, Namespace: "default"}}
1050+
_, err := reconciler.Reconcile(context.Background(), req)
1051+
if err != nil {
1052+
t.Fatalf("reconcile failed: %v", err)
1053+
}
1054+
1055+
// Verify metric
1056+
val := testutil.ToFloat64(asmetrics.SandboxClaimCreationTotal.WithLabelValues("default", "test-template", asmetrics.LaunchTypeWarm, "test-pool", "ready"))
1057+
if val != 1 {
1058+
t.Errorf("expected metric count 1, got %v", val)
1059+
}
1060+
})
1061+
1062+
t.Run("Warm Start Not Ready", func(t *testing.T) {
1063+
asmetrics.SandboxClaimCreationTotal.Reset()
1064+
1065+
// Create a warm pool pod that is NOT ready
1066+
poolNameHash := sandboxcontrollers.NameHash("test-pool")
1067+
warmPod := &corev1.Pod{
1068+
ObjectMeta: metav1.ObjectMeta{
1069+
Name: "warm-pod-not-ready",
1070+
Namespace: "default",
1071+
Labels: map[string]string{
1072+
poolLabel: poolNameHash,
1073+
sandboxTemplateRefHash: sandboxcontrollers.NameHash("test-template"),
1074+
},
1075+
OwnerReferences: []metav1.OwnerReference{
1076+
{
1077+
APIVersion: "extensions.agents.x-k8s.io/v1alpha1",
1078+
Kind: "SandboxWarmPool",
1079+
Name: "test-pool",
1080+
UID: "pool-uid",
1081+
Controller: ptr.To(true),
1082+
},
1083+
},
1084+
},
1085+
Status: corev1.PodStatus{Conditions: []corev1.PodCondition{{Type: corev1.PodReady, Status: corev1.ConditionFalse}}},
1086+
Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "c", Image: "i"}}},
1087+
}
1088+
1089+
scheme := newScheme(t)
1090+
client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(template, claim, warmPod).WithStatusSubresource(claim).Build()
1091+
reconciler := &SandboxClaimReconciler{
1092+
Client: client,
1093+
Scheme: scheme,
1094+
Recorder: record.NewFakeRecorder(10),
1095+
Tracer: asmetrics.NewNoOp(),
1096+
}
1097+
1098+
req := reconcile.Request{NamespacedName: types.NamespacedName{Name: claim.Name, Namespace: "default"}}
1099+
_, err := reconciler.Reconcile(context.Background(), req)
1100+
if err != nil {
1101+
t.Fatalf("reconcile failed: %v", err)
1102+
}
1103+
1104+
// Verify metric
1105+
val := testutil.ToFloat64(asmetrics.SandboxClaimCreationTotal.WithLabelValues("default", "test-template", asmetrics.LaunchTypeWarm, "test-pool", "not_ready"))
1106+
if val != 1 {
1107+
t.Errorf("expected metric count 1, got %v", val)
1108+
}
1109+
})
1110+
}
1111+
9721112
func newScheme(t *testing.T) *runtime.Scheme {
9731113
scheme := runtime.NewScheme()
9741114
if err := sandboxv1alpha1.AddToScheme(scheme); err != nil {

internal/metrics/metrics.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,15 +42,36 @@ var (
4242
},
4343
[]string{"launch_type", "sandbox_template"},
4444
)
45+
46+
// SandboxClaimCreationTotal counts the total number of SandboxClaims created.
47+
// Labels:
48+
// - namespace: the namespace of the claim
49+
// - sandbox_template: the SandboxTemplateRef
50+
// - launch_type: "warm", "cold", "unknown"
51+
// - warmpool_name: the name of the warm pool (if applicable)
52+
// - pod_condition: "ready", "not_ready"
53+
SandboxClaimCreationTotal = prometheus.NewCounterVec(
54+
prometheus.CounterOpts{
55+
Name: "agent_sandbox_claim_creation_total",
56+
Help: "Total number of SandboxClaims created, labeled by namespace, sandbox template, launch type, warmpool name, and pod condition.",
57+
},
58+
[]string{"namespace", "sandbox_template", "launch_type", "warmpool_name", "pod_condition"},
59+
)
4560
)
4661

4762
// Init registers custom metrics with the global controller-runtime registry.
4863
func init() {
4964
metrics.Registry.MustRegister(ClaimStartupLatency)
65+
metrics.Registry.MustRegister(SandboxClaimCreationTotal)
5066
}
5167

5268
// RecordClaimStartupLatency records the duration since the provided start time.
5369
func RecordClaimStartupLatency(startTime time.Time, launchType, templateName string) {
5470
duration := float64(time.Since(startTime).Milliseconds())
5571
ClaimStartupLatency.WithLabelValues(launchType, templateName).Observe(duration)
5672
}
73+
74+
// RecordSandboxClaimCreation increments the total count of created sandbox claims.
75+
func RecordSandboxClaimCreation(namespace, templateName, launchType, warmPoolName, podCondition string) {
76+
SandboxClaimCreationTotal.WithLabelValues(namespace, templateName, launchType, warmPoolName, podCondition).Inc()
77+
}

internal/metrics/metrics_test.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,27 @@ func TestClaimLatencyRecording(t *testing.T) {
4242
})
4343
}
4444
}
45+
46+
func TestSandboxClaimCreationRecording(t *testing.T) {
47+
testCases := []struct {
48+
name string
49+
launchType string
50+
podCondition string
51+
}{
52+
{"WarmReady", LaunchTypeWarm, "ready"},
53+
{"WarmNotReady", LaunchTypeWarm, "not_ready"},
54+
{"Cold", LaunchTypeCold, "not_ready"},
55+
{"Unknown", LaunchTypeUnknown, "not_ready"},
56+
}
57+
58+
for _, tc := range testCases {
59+
t.Run(tc.name, func(t *testing.T) {
60+
SandboxClaimCreationTotal.Reset()
61+
SandboxClaimCreationTotal.WithLabelValues("default", "test-tmpl", tc.launchType, "test-pool", tc.podCondition).Inc()
62+
63+
if testutil.CollectAndCount(SandboxClaimCreationTotal) != 1 {
64+
t.Errorf("Expected 1 observation")
65+
}
66+
})
67+
}
68+
}

0 commit comments

Comments
 (0)