Skip to content
29 changes: 29 additions & 0 deletions api/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@ package api
import (
"context"
"fmt"
"time"

"github.com/buildkite/go-pipeline"
"github.com/buildkite/roko"
)

// Job represents a Buildkite Agent API Job
Expand Down Expand Up @@ -160,6 +162,33 @@ func (c *Client) PromiseFailure(ctx context.Context, id string, req *JobPromiseF
return c.doRequest(r, nil)
}

// PromiseFailureWithRetry declares a promised failure for the job, retrying
// transient errors with the agent's standard backoff. It returns the HTTP
// status of the most recent response (0 if none was received, e.g. a network
// error) and an error describing any failure. Retry attempts are logged via
// warnf.
func (c *Client) PromiseFailureWithRetry(ctx context.Context, id string, req *JobPromiseFailureRequest, warnf func(string, ...any)) (int, error) {
var statusCode int
err := roko.NewRetrier(
roko.WithMaxAttempts(10),
roko.WithStrategy(roko.ExponentialSubsecond(2*time.Second)),
).DoWithContext(ctx, func(r *roko.Retrier) error {
resp, err := c.PromiseFailure(ctx, id, req)
if resp != nil {
statusCode = resp.StatusCode
}
if BreakOnNonRetryable(r, resp, err) {
return err
}
if err != nil {
warnf("Couldn't declare promised failure for job %s: %s (%s)", id, err, r)
return err
}
return nil
})
return statusCode, err
}

// JobUpdateResponse is the response from updating a job
type JobUpdateResponse struct {
ID string `json:"id"`
Expand Down
125 changes: 82 additions & 43 deletions clicommand/job_promise_failure.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@ package clicommand

import (
"context"
"errors"
"fmt"
"net/http"
"os"
"slices"
"strconv"
"time"

"github.com/buildkite/agent/v3/api"
"github.com/buildkite/agent/v3/internal/redact"
"github.com/buildkite/roko"
"github.com/buildkite/agent/v3/internal/socket"
"github.com/buildkite/agent/v3/jobapi"
"github.com/buildkite/agent/v3/logger"
"github.com/urfave/cli"
)

Expand All @@ -31,7 +34,9 @@ reports a soft-failure status, the promised status is kept. Any other
reported failure is recorded as reported.

Repeated calls with the same exit status are idempotent. Declaring a
different exit status once one is already recorded is rejected.
different exit status once one is already recorded is rejected. The agent
debounces repeated calls locally, so each exit status is only declared to
the Buildkite API once per job, even if you call this on every test failure.

The command exits non-zero if the promise is not accepted (for example, if
the job is no longer running, or a different exit status was already
Expand All @@ -49,7 +54,6 @@ type JobPromiseFailureConfig struct {

ExitStatus string `cli:"arg:0" label:"exit status" validate:"required"`
Reason string `cli:"reason"`
Job string `cli:"job" validate:"required"`
RedactedVars []string `cli:"redacted-vars" normalize:"list"`
}

Expand All @@ -59,12 +63,6 @@ var JobPromiseFailureCommand = cli.Command{
Description: jobPromiseFailureHelpDescription,
Hidden: true, // hidden until the early-failure feature is generally available
Flags: slices.Concat(globalFlags(), apiFlags(), []cli.Flag{
cli.StringFlag{
Name: "job",
Value: "",
Usage: "The job to declare an early failure for. Defaults to the current job",
EnvVar: "BUILDKITE_JOB_ID",
},
cli.StringFlag{
Name: "reason",
Value: "",
Expand All @@ -86,52 +84,93 @@ var JobPromiseFailureCommand = cli.Command{
return fmt.Errorf("exit status must be a positive integer: a promised failure cannot have a zero (successful) or negative exit status")
}

client := api.NewClient(l, loadAPIClientConfig(cfg, "AgentAccessToken"))
// Always target the current job (BUILDKITE_JOB_ID): the API requires a
// job token and rejects (403) any other job, and a promised failure only
// makes sense for the job that's running.
jobID := os.Getenv("BUILDKITE_JOB_ID")
if jobID == "" {
return fmt.Errorf("BUILDKITE_JOB_ID is not set: this command must be run from within a job")
}

needles, _, err := redact.NeedlesFromEnv(cfg.RedactedVars)
if err != nil {
return err
}
if redactedValue := redact.String(cfg.Reason, needles); redactedValue != cfg.Reason {
l.Warnf("The promise-failure reason for job %q contained one or more secrets from environment variables that have been redacted. If this is deliberate, pass --redacted-vars='' or a list of patterns that does not match the variable containing the secret", cfg.Job)
cfg.Reason = redactedValue
reason := cfg.Reason
if redactedValue := redact.String(reason, needles); redactedValue != reason {
l.Warnf("The promise-failure reason for job %q contained one or more secrets from environment variables that have been redacted. If this is deliberate, pass --redacted-vars='' or a list of patterns that does not match the variable containing the secret", jobID)
reason = redactedValue
}

req := &api.JobPromiseFailureRequest{
ExitStatus: exitStatus,
Reason: cfg.Reason,
// Prefer the Job API: it debounces repeated and concurrent calls (this
// may be called on every test failure) so the failure is declared at most
// once successfully, blocking for an accurate result. Declare directly
// only when the Job API can't be used (--no-job-api, or old Windows
// without Unix sockets) or can't be reached.
client, err := jobapi.NewDefaultClient(ctx)
if err != nil {
l.Debugf("Job API unavailable, declaring promised failure directly: %v", err)
return declarePromiseFailureDirectly(ctx, l, cfg, jobID, exitStatus, reason)
}

err = roko.NewRetrier(
roko.WithMaxAttempts(10),
roko.WithStrategy(roko.ExponentialSubsecond(2*time.Second)),
).DoWithContext(ctx, func(r *roko.Retrier) error {
resp, err := client.PromiseFailure(ctx, cfg.Job, req)
if api.BreakOnNonRetryable(r, resp, err) {
return err
}
if err != nil {
l.Warnf("%s (%s)", err, r)
return err
outcome, err := client.DeclarePromiseFailure(ctx, exitStatus, reason)
if err == nil {
if outcome == jobapi.PromiseFailureDebounced {
// Log at debug to avoid spamming job logs on repeated calls.
l.Debugf("Promised exit status %d already declared for job %s (debounced)", exitStatus, jobID)
} else {
l.Infof("Declared promised exit status %d for job %s", exitStatus, jobID)
}
return nil
})
if err != nil {
// The promise wasn't accepted. Exit non-zero so the outcome is
// visible to scripts; callers who consider a given case acceptable
// can append '|| true'.
switch {
case api.IsErrHavingStatus(err, http.StatusConflict):
return fmt.Errorf("a different promised exit status has already been declared for this job: %w", err)

case api.IsErrHavingStatus(err, http.StatusUnprocessableEntity):
return fmt.Errorf("the job is no longer running and cannot accept a promised failure: %w", err)
}
}

return fmt.Errorf("failed to declare promised job failure: %w", err)
// The Job API returned a definitive HTTP error: the Buildkite API
// rejected the declaration (409, 422) or was unreachable after retries
// (502). Surface it rather than declaring again.
var apiErr socket.APIErr
if errors.As(err, &apiErr) {
return promiseFailureError(apiErr.StatusCode, err)
}

l.Infof("Declared promised exit status %d for job %s", exitStatus, cfg.Job)
return nil
// We couldn't reach the Job API (or its response was lost). Declare
// directly so the promise still lands; the endpoint is idempotent for the
// same exit status, so a duplicate is safe.
l.Warnf("Couldn't reach the Job API to declare the promised failure; declaring it directly: %v", err)
return declarePromiseFailureDirectly(ctx, l, cfg, jobID, exitStatus, reason)
},
}

// promiseFailureError wraps err with a human-readable message for the Buildkite
// API status code. The command exits non-zero so the failure is visible to
// scripts, which can append '|| true' to ignore it.
func promiseFailureError(status int, err error) error {
switch status {
case http.StatusConflict:
return fmt.Errorf("a different promised exit status has already been declared for this job: %w", err)

case http.StatusUnprocessableEntity:
return fmt.Errorf("the job is no longer running and cannot accept a promised failure: %w", err)
}

return fmt.Errorf("failed to declare promised job failure: %w", err)
}

// declarePromiseFailureDirectly declares a promised failure straight to the
// Buildkite API, without debouncing via the Job API. It's used as a fallback
// when the Job API can't be used or reached.
func declarePromiseFailureDirectly(ctx context.Context, l logger.Logger, cfg JobPromiseFailureConfig, jobID string, exitStatus int, reason string) error {
client := api.NewClient(l, loadAPIClientConfig(cfg, "AgentAccessToken"))

req := &api.JobPromiseFailureRequest{
ExitStatus: exitStatus,
Reason: reason,
}

status, err := client.PromiseFailureWithRetry(ctx, jobID, req, l.Warnf)
if err != nil {
return promiseFailureError(status, err)
}

l.Infof("Declared promised exit status %d for job %s", exitStatus, jobID)
return nil
}
28 changes: 27 additions & 1 deletion internal/job/api.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
package job

import (
"context"
"fmt"

"github.com/buildkite/agent/v3/api"
"github.com/buildkite/agent/v3/internal/redact"
"github.com/buildkite/agent/v3/internal/socket"
"github.com/buildkite/agent/v3/jobapi"
"github.com/buildkite/agent/v3/logger"
)

// startJobAPI starts the job API server, iff the OS of the box supports it otherwise it returns a
Expand All @@ -26,7 +29,9 @@ We'll continue to run your job, but you won't be able to use the Job API`)
return cleanup, fmt.Errorf("creating job API socket path: %w", err)
}

jobAPIOpts := []jobapi.ServerOpts{}
jobAPIOpts := []jobapi.ServerOpts{
jobapi.WithPromiseFailureDeclarer(e.declarePromiseFailure),
}
if e.Debug {
jobAPIOpts = append(jobAPIOpts, jobapi.WithDebug())
}
Expand Down Expand Up @@ -69,3 +74,24 @@ We'll continue to run your job, but you won't be able to use the Job API`)
}
}, nil
}

// declarePromiseFailure declares a promised failure for the current job to the
// Buildkite API. The Job API server debounces calls, so this runs at most once
// per successfully-declared exit status. It returns the status code of the most
// recent API response (0 if none was received, e.g. a network error) and an
// error describing any failure.
func (e *Executor) declarePromiseFailure(ctx context.Context, exitStatus int, reason string) (int, error) {
// logger.Discard keeps the access token (in HTTP debug dumps) out of the job
// log; retry warnings still go to the shell logger.
apiClient := api.NewClient(logger.Discard, api.Config{
Endpoint: e.shell.Env.GetString("BUILDKITE_AGENT_ENDPOINT", ""),
Token: e.shell.Env.GetString("BUILDKITE_AGENT_ACCESS_TOKEN", ""),
})

req := &api.JobPromiseFailureRequest{
ExitStatus: exitStatus,
Reason: reason,
}

return apiClient.PromiseFailureWithRetry(ctx, e.JobID, req, e.shell.Warningf)
}
27 changes: 24 additions & 3 deletions jobapi/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@ import (
)

const (
envURL = "http://job/api/current-job/v0/env"
workdirURL = "http://job/api/current-job/v0/workdir"
redactionsURL = "http://job/api/current-job/v0/redactions"
envURL = "http://job/api/current-job/v0/env"
workdirURL = "http://job/api/current-job/v0/workdir"
redactionsURL = "http://job/api/current-job/v0/redactions"
promiseFailureURL = "http://job/api/current-job/v0/promise-failure"
)

var (
Expand Down Expand Up @@ -121,3 +122,23 @@ func (c *Client) RedactionCreate(ctx context.Context, text string) (string, erro
}
return resp.Redacted, nil
}

// DeclarePromiseFailure asks the Job API to declare a promised failure with the
// given exit status and reason to the Buildkite API, blocking until it
// completes. The server debounces repeated and concurrent calls for the same
// exit status: concurrent callers share one in-flight call, and once it succeeds
// later calls return from the cache. On success it returns the outcome
// (PromiseFailureDeclared or PromiseFailureDebounced). A failed declaration is
// returned as a socket.APIErr carrying the HTTP status code (the Buildkite API's
// status when it responded, otherwise 502).
func (c *Client) DeclarePromiseFailure(ctx context.Context, exitStatus int, reason string) (string, error) {
req := PromiseFailureRequest{
ExitStatus: exitStatus,
Reason: reason,
}
var resp PromiseFailureResponse
if err := c.client.Do(ctx, http.MethodPost, promiseFailureURL, &req, &resp); err != nil {
return "", err
}
return resp.Outcome, nil
}
Loading