Skip to content

Commit 0dbc1e5

Browse files
committed
chore(ci): fix flaky test
Fix the `api.ServiceAccountSuite/TestValid` test by waiting for the endpoint to have at least one IP. Dump the job logs on failure. Signed-off-by: Noel Georgi <git@frezbo.dev>
1 parent b687a47 commit 0dbc1e5

1 file changed

Lines changed: 78 additions & 3 deletions

File tree

internal/integration/api/serviceaccount.go

Lines changed: 78 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,7 @@ func (suite *ServiceAccountSuite) creteTestJob(ns, name, serviceAccount, node st
256256
"name": name,
257257
},
258258
"spec": map[string]any{
259+
"backoffLimit": int64(2),
259260
"template": map[string]any{
260261
"spec": map[string]any{
261262
"restartPolicy": "Never",
@@ -290,6 +291,7 @@ func (suite *ServiceAccountSuite) creteTestJob(ns, name, serviceAccount, node st
290291
}, metav1.CreateOptions{})
291292
}
292293

294+
//nolint:gocyclo
293295
func (suite *ServiceAccountSuite) waitForJobReady(duration time.Duration, ns, name string) error {
294296
cli := suite.DynamicClient.Resource(jobGVR).Namespace(ns)
295297

@@ -306,6 +308,24 @@ func (suite *ServiceAccountSuite) waitForJobReady(duration time.Duration, ns, na
306308
}
307309

308310
status := job.Object["status"].(map[string]any)
311+
312+
// check if the job has been marked Failed (all backoff retries exhausted)
313+
if conditions, ok := status["conditions"].([]any); ok {
314+
for _, c := range conditions {
315+
cond, ok := c.(map[string]any)
316+
if !ok {
317+
continue
318+
}
319+
320+
if cond["type"] == "Failed" && cond["status"] == "True" {
321+
failed, _ := status["failed"].(int64)
322+
podLogs := suite.getJobPodLogs(ctx, ns, name)
323+
324+
return fmt.Errorf("job %s/%s exhausted retries (failed=%d)%s", ns, name, failed, podLogs)
325+
}
326+
}
327+
}
328+
309329
if status["succeeded"] == nil || status["succeeded"].(int64) == 0 {
310330
return retry.ExpectedError(fmt.Errorf("job %s/%s is not ready yet", ns, name))
311331
}
@@ -314,7 +334,44 @@ func (suite *ServiceAccountSuite) waitForJobReady(duration time.Duration, ns, na
314334
})
315335
}
316336

337+
// getJobPodLogs builds a diagnostic suffix for an error message about job
// jobName in namespace ns. It lists the pods carrying the label
// "job-name=<jobName>", picks the one with the latest creation timestamp and
// returns the tail of its logs.
//
// The function never fails: when pods cannot be listed, none are found, or the
// logs cannot be fetched, a short parenthesized note is returned instead.
// Every returned string starts with ": " so it can be appended directly to an
// existing message.
func (suite *ServiceAccountSuite) getJobPodLogs(ctx context.Context, ns, jobName string) string {
338+
pods, err := suite.Clientset.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{
339+
LabelSelector: "job-name=" + jobName,
340+
})
341+
if err != nil {
342+
return fmt.Sprintf(": (failed to list pods: %v)", err)
343+
}
344+
345+
if len(pods.Items) == 0 {
346+
return ": (no pods found)"
347+
}
348+
349+
// pick the most recently created pod; logs of older pods of the same job are not collected
350+
newest := pods.Items[0]
351+
352+
for _, p := range pods.Items[1:] {
353+
if p.CreationTimestamp.After(newest.CreationTimestamp.Time) {
354+
newest = p
355+
}
356+
}
357+
358+
// only the last 50 log lines are requested
tailLines := int64(50)
359+
360+
req := suite.Clientset.CoreV1().Pods(ns).GetLogs(newest.Name, &corev1.PodLogOptions{
361+
TailLines: &tailLines,
362+
})
363+
364+
logs, err := req.DoRaw(ctx)
365+
if err != nil {
366+
return fmt.Sprintf(": pod %s (no logs: %v)", newest.Name, err)
367+
}
368+
369+
return fmt.Sprintf(": pod %s logs:\n%s", newest.Name, string(logs))
370+
}
371+
317372
// configureAPIAccess configures the API access feature on all control plane nodes.
373+
//
374+
//nolint:gocyclo
318375
func (suite *ServiceAccountSuite) configureAPIAccess(
319376
enabled bool,
320377
allowedRoles []string,
@@ -349,7 +406,7 @@ func (suite *ServiceAccountSuite) configureAPIAccess(
349406
}
350407
}
351408

352-
if enabled { // wait for CRD and the Talos endpoint to be created
409+
if enabled { // wait for CRD, Talos endpoint service, and at least one ready endpoint
353410
return retry.Constant(30*time.Second).RetryWithContext(suite.ctx, func(ctx context.Context) error {
354411
_, err := suite.getCRD()
355412
if err != nil {
@@ -358,12 +415,30 @@ func (suite *ServiceAccountSuite) configureAPIAccess(
358415

359416
_, err = suite.Clientset.CoreV1().
360417
Services(constants.KubernetesTalosAPIServiceNamespace).
361-
Get(suite.ctx, constants.KubernetesTalosAPIServiceName, metav1.GetOptions{})
418+
Get(ctx, constants.KubernetesTalosAPIServiceName, metav1.GetOptions{})
362419
if err != nil {
363420
return retry.ExpectedError(err)
364421
}
365422

366-
return nil
423+
slices, err := suite.Clientset.DiscoveryV1().
424+
EndpointSlices(constants.KubernetesTalosAPIServiceNamespace).
425+
List(ctx, metav1.ListOptions{
426+
LabelSelector: "kubernetes.io/service-name=" + constants.KubernetesTalosAPIServiceName,
427+
})
428+
if err != nil {
429+
return retry.ExpectedError(err)
430+
}
431+
432+
for _, slice := range slices.Items {
433+
for _, ep := range slice.Endpoints {
434+
if len(ep.Addresses) > 0 && (ep.Conditions.Ready == nil || *ep.Conditions.Ready) {
435+
return nil
436+
}
437+
}
438+
}
439+
440+
return retry.ExpectedError(fmt.Errorf("service %s/%s has no ready endpoints",
441+
constants.KubernetesTalosAPIServiceNamespace, constants.KubernetesTalosAPIServiceName))
367442
})
368443
}
369444

0 commit comments

Comments
 (0)