@@ -256,6 +256,7 @@ func (suite *ServiceAccountSuite) creteTestJob(ns, name, serviceAccount, node st
256256 "name" : name ,
257257 },
258258 "spec" : map [string ]any {
259+ "backoffLimit" : int64 (2 ),
259260 "template" : map [string ]any {
260261 "spec" : map [string ]any {
261262 "restartPolicy" : "Never" ,
@@ -290,6 +291,7 @@ func (suite *ServiceAccountSuite) creteTestJob(ns, name, serviceAccount, node st
290291 }, metav1.CreateOptions {})
291292}
292293
294+ //nolint:gocyclo
293295func (suite * ServiceAccountSuite ) waitForJobReady (duration time.Duration , ns , name string ) error {
294296 cli := suite .DynamicClient .Resource (jobGVR ).Namespace (ns )
295297
@@ -306,6 +308,24 @@ func (suite *ServiceAccountSuite) waitForJobReady(duration time.Duration, ns, na
306308 }
307309
308310 status := job .Object ["status" ].(map [string ]any )
311+
312+ // check if the job has been marked Failed (all backoff retries exhausted)
313+ if conditions , ok := status ["conditions" ].([]any ); ok {
314+ for _ , c := range conditions {
315+ cond , ok := c .(map [string ]any )
316+ if ! ok {
317+ continue
318+ }
319+
320+ if cond ["type" ] == "Failed" && cond ["status" ] == "True" {
321+ failed , _ := status ["failed" ].(int64 )
322+ podLogs := suite .getJobPodLogs (ctx , ns , name )
323+
324+ return fmt .Errorf ("job %s/%s exhausted retries (failed=%d)%s" , ns , name , failed , podLogs )
325+ }
326+ }
327+ }
328+
309329 if status ["succeeded" ] == nil || status ["succeeded" ].(int64 ) == 0 {
310330 return retry .ExpectedError (fmt .Errorf ("job %s/%s is not ready yet" , ns , name ))
311331 }
@@ -314,7 +334,44 @@ func (suite *ServiceAccountSuite) waitForJobReady(duration time.Duration, ns, na
314334 })
315335}
316336
337+ func (suite * ServiceAccountSuite ) getJobPodLogs (ctx context.Context , ns , jobName string ) string {
338+ pods , err := suite .Clientset .CoreV1 ().Pods (ns ).List (ctx , metav1.ListOptions {
339+ LabelSelector : "job-name=" + jobName ,
340+ })
341+ if err != nil {
342+ return fmt .Sprintf (": (failed to list pods: %v)" , err )
343+ }
344+
345+ if len (pods .Items ) == 0 {
346+ return ": (no pods found)"
347+ }
348+
349+ // pick the most recently created pod
350+ newest := pods .Items [0 ]
351+
352+ for _ , p := range pods .Items [1 :] {
353+ if p .CreationTimestamp .After (newest .CreationTimestamp .Time ) {
354+ newest = p
355+ }
356+ }
357+
358+ tailLines := int64 (50 )
359+
360+ req := suite .Clientset .CoreV1 ().Pods (ns ).GetLogs (newest .Name , & corev1.PodLogOptions {
361+ TailLines : & tailLines ,
362+ })
363+
364+ logs , err := req .DoRaw (ctx )
365+ if err != nil {
366+ return fmt .Sprintf (": pod %s (no logs: %v)" , newest .Name , err )
367+ }
368+
369+ return fmt .Sprintf (": pod %s logs:\n %s" , newest .Name , string (logs ))
370+ }
371+
317372// configureAPIAccess configures the API access feature on all control plane nodes.
373+ //
374+ //nolint:gocyclo
318375func (suite * ServiceAccountSuite ) configureAPIAccess (
319376 enabled bool ,
320377 allowedRoles []string ,
@@ -349,7 +406,7 @@ func (suite *ServiceAccountSuite) configureAPIAccess(
349406 }
350407 }
351408
352- if enabled { // wait for CRD and the Talos endpoint to be created
409+ if enabled { // wait for CRD, Talos endpoint service, and at least one ready endpoint
353410 return retry .Constant (30 * time .Second ).RetryWithContext (suite .ctx , func (ctx context.Context ) error {
354411 _ , err := suite .getCRD ()
355412 if err != nil {
@@ -358,12 +415,30 @@ func (suite *ServiceAccountSuite) configureAPIAccess(
358415
359416 _ , err = suite .Clientset .CoreV1 ().
360417 Services (constants .KubernetesTalosAPIServiceNamespace ).
361- Get (suite . ctx , constants .KubernetesTalosAPIServiceName , metav1.GetOptions {})
418+ Get (ctx , constants .KubernetesTalosAPIServiceName , metav1.GetOptions {})
362419 if err != nil {
363420 return retry .ExpectedError (err )
364421 }
365422
366- return nil
423+ slices , err := suite .Clientset .DiscoveryV1 ().
424+ EndpointSlices (constants .KubernetesTalosAPIServiceNamespace ).
425+ List (ctx , metav1.ListOptions {
426+ LabelSelector : "kubernetes.io/service-name=" + constants .KubernetesTalosAPIServiceName ,
427+ })
428+ if err != nil {
429+ return retry .ExpectedError (err )
430+ }
431+
432+ for _ , slice := range slices .Items {
433+ for _ , ep := range slice .Endpoints {
434+ if len (ep .Addresses ) > 0 && (ep .Conditions .Ready == nil || * ep .Conditions .Ready ) {
435+ return nil
436+ }
437+ }
438+ }
439+
440+ return retry .ExpectedError (fmt .Errorf ("service %s/%s has no ready endpoints" ,
441+ constants .KubernetesTalosAPIServiceNamespace , constants .KubernetesTalosAPIServiceName ))
367442 })
368443 }
369444
0 commit comments