Skip to content

Commit cb8abee

Browse files
kaovilaiclaude
andcommitted
Fix silent volume restoration failures for encrypted snapshots
Resolves velero-io/velero#3145 and velero-io/velero#9128 Previously, when restoring EBS snapshots with KMS encryption, the plugin would report success even when volume creation failed due to missing KMS permissions (kms:Decrypt, kms:ReEncrypt*, kms:CreateGrant). This created a silent failure scenario where Velero logs showed successful restoration but the volume was never actually created. Changes: - Add volume creation verification with polling in CreateVolumeFromSnapshot - Wait for volume to reach 'available' state before returning success - Enhanced error handling for KMS permission failures with actionable messages - Add configurable timeout (volumeCreationTimeout) and poll interval (volumeCreationPollInterval) - Comprehensive test coverage for new error handling and configuration The fix transforms silent failures into clear error messages, helping users quickly identify and resolve KMS permission issues during volume restoration. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 59a6774 commit cb8abee

2 files changed

Lines changed: 256 additions & 9 deletions

File tree

velero-plugin-for-aws/volume_snapshotter.go

Lines changed: 121 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"os"
2626
"regexp"
2727
"strings"
28+
"time"
2829

2930
"github.com/pkg/errors"
3031
"github.com/sirupsen/logrus"
@@ -37,8 +38,14 @@ import (
3738
)
3839

3940
const (
40-
regionKey = "region"
41-
ebsCSIDriver = "ebs.csi.aws.com"
41+
regionKey = "region"
42+
ebsCSIDriver = "ebs.csi.aws.com"
43+
volumeCreationTimeoutKey = "volumeCreationTimeout"
44+
volumeCreationPollIntervalKey = "volumeCreationPollInterval"
45+
46+
// Volume creation verification settings
47+
defaultVolumeCreationTimeout = 10 * time.Minute
48+
volumeStatusPollInterval = 15 * time.Second
4249
)
4350

4451
// iopsVolumeTypes is a set of AWS EBS volume types for which IOPS should
@@ -47,16 +54,18 @@ const (
4754
var iopsVolumeTypes = sets.NewString("io1", "io2")
4855

4956
type VolumeSnapshotter struct {
50-
log logrus.FieldLogger
51-
ec2 *ec2.Client
57+
log logrus.FieldLogger
58+
ec2 *ec2.Client
59+
volumeCreationTimeout time.Duration
60+
volumePollInterval time.Duration
5261
}
5362

5463
func newVolumeSnapshotter(logger logrus.FieldLogger) *VolumeSnapshotter {
5564
return &VolumeSnapshotter{log: logger}
5665
}
5766

5867
func (b *VolumeSnapshotter) Init(config map[string]string) error {
59-
if err := veleroplugin.ValidateVolumeSnapshotterConfigKeys(config, regionKey, credentialProfileKey, credentialsFileKey, enableSharedConfigKey); err != nil {
68+
if err := veleroplugin.ValidateVolumeSnapshotterConfigKeys(config, regionKey, credentialProfileKey, credentialsFileKey, enableSharedConfigKey, volumeCreationTimeoutKey, volumeCreationPollIntervalKey); err != nil {
6069
return err
6170
}
6271

@@ -67,6 +76,27 @@ func (b *VolumeSnapshotter) Init(config map[string]string) error {
6776
if region == "" {
6877
return errors.Errorf("missing %s in aws configuration", regionKey)
6978
}
79+
80+
// Parse volume creation timeout
81+
b.volumeCreationTimeout = defaultVolumeCreationTimeout
82+
if timeoutStr := config[volumeCreationTimeoutKey]; timeoutStr != "" {
83+
if timeout, err := time.ParseDuration(timeoutStr); err != nil {
84+
return errors.Wrapf(err, "invalid %s duration format", volumeCreationTimeoutKey)
85+
} else {
86+
b.volumeCreationTimeout = timeout
87+
}
88+
}
89+
90+
// Parse volume poll interval
91+
b.volumePollInterval = volumeStatusPollInterval
92+
if intervalStr := config[volumeCreationPollIntervalKey]; intervalStr != "" {
93+
if interval, err := time.ParseDuration(intervalStr); err != nil {
94+
return errors.Wrapf(err, "invalid %s duration format", volumeCreationPollIntervalKey)
95+
} else {
96+
b.volumePollInterval = interval
97+
}
98+
}
99+
70100
cfg, err := newConfigBuilder(b.log).
71101
WithRegion(region).
72102
WithProfile(credentialProfile).
@@ -119,7 +149,15 @@ func (b *VolumeSnapshotter) CreateVolumeFromSnapshot(snapshotID, volumeType, vol
119149
return "", errors.WithStack(err)
120150
}
121151

122-
return *output.VolumeId, nil
152+
volumeID = *output.VolumeId
153+
154+
// Verify that the volume is actually created and available
155+
// This is critical for detecting KMS permission failures and other async errors
156+
if err := b.waitForVolumeAvailable(volumeID); err != nil {
157+
return "", errors.Wrapf(err, "volume creation failed for snapshot %s", snapshotID)
158+
}
159+
160+
return volumeID, nil
123161
}
124162

125163
func (b *VolumeSnapshotter) GetVolumeInfo(volumeID, volumeAZ string) (string, *int64, error) {
@@ -159,6 +197,83 @@ func (b *VolumeSnapshotter) describeVolume(volumeID string) (types.Volume, error
159197
return output.Volumes[0], nil
160198
}
161199

200+
// waitForVolumeAvailable polls the volume status until it becomes available or times out.
201+
// This is essential for detecting KMS permission failures and other async volume creation errors.
202+
func (b *VolumeSnapshotter) waitForVolumeAvailable(volumeID string) error {
203+
ctx, cancel := context.WithTimeout(context.Background(), b.volumeCreationTimeout)
204+
defer cancel()
205+
206+
b.log.WithFields(logrus.Fields{
207+
"volumeID": volumeID,
208+
"timeout": b.volumeCreationTimeout,
209+
"interval": b.volumePollInterval,
210+
}).Info("Waiting for volume to become available")
211+
212+
for {
213+
select {
214+
case <-ctx.Done():
215+
return errors.Errorf("timeout waiting for volume %s to become available after %v. Check AWS CloudTrail for detailed error information", volumeID, b.volumeCreationTimeout)
216+
default:
217+
}
218+
219+
volume, err := b.describeVolume(volumeID)
220+
if err != nil {
221+
// Check if volume doesn't exist yet (still being created)
222+
var apiErr smithy.APIError
223+
if errors.As(err, &apiErr) {
224+
if apiErr.ErrorCode() == "InvalidVolume.NotFound" {
225+
b.log.WithField("volumeID", volumeID).Debug("Volume not found yet, continuing to wait")
226+
time.Sleep(b.volumePollInterval)
227+
continue
228+
}
229+
}
230+
231+
// For other errors, return immediately with enhanced context
232+
return b.enhanceVolumeCreationError(err, volumeID)
233+
}
234+
235+
state := volume.State
236+
b.log.WithFields(logrus.Fields{
237+
"volumeID": volumeID,
238+
"state": state,
239+
}).Debug("Volume status check")
240+
241+
switch state {
242+
case types.VolumeStateAvailable:
243+
b.log.WithField("volumeID", volumeID).Info("Volume successfully created and available")
244+
return nil
245+
case types.VolumeStateError:
246+
return errors.Errorf("volume %s creation failed with state 'error'. This often indicates KMS permission issues for encrypted snapshots. Required KMS permissions: kms:Decrypt, kms:ReEncrypt*, kms:CreateGrant", volumeID)
247+
case types.VolumeStateCreating:
248+
// Volume is still being created, continue waiting
249+
b.log.WithField("volumeID", volumeID).Debug("Volume is still being created")
250+
default:
251+
b.log.WithFields(logrus.Fields{
252+
"volumeID": volumeID,
253+
"state": state,
254+
}).Debug("Volume in intermediate state, continuing to wait")
255+
}
256+
257+
time.Sleep(b.volumePollInterval)
258+
}
259+
}
260+
261+
// enhanceVolumeCreationError provides more detailed error messages for common volume creation failures
262+
func (b *VolumeSnapshotter) enhanceVolumeCreationError(err error, volumeID string) error {
263+
var apiErr smithy.APIError
264+
if errors.As(err, &apiErr) {
265+
switch apiErr.ErrorCode() {
266+
case "UnauthorizedOperation":
267+
return errors.Errorf("insufficient permissions to access volume %s or related KMS key. Required KMS permissions: kms:Decrypt, kms:ReEncrypt*, kms:CreateGrant. Original error: %v", volumeID, err)
268+
case "InvalidKey.Malformed", "KMSKeyNotAccessibleFault":
269+
return errors.Errorf("KMS key access failed for volume %s. Ensure the KMS key exists and grants necessary permissions: kms:Decrypt, kms:ReEncrypt*, kms:CreateGrant. Original error: %v", volumeID, err)
270+
default:
271+
return errors.Wrapf(err, "failed to verify volume %s creation status", volumeID)
272+
}
273+
}
274+
return errors.Wrapf(err, "failed to verify volume %s creation status", volumeID)
275+
}
276+
162277
func (b *VolumeSnapshotter) CreateSnapshot(volumeID, volumeAZ string, tags map[string]string) (string, error) {
163278
// describe the volume so we can copy its tags to the snapshot
164279
volumeInfo, err := b.describeVolume(volumeID)

velero-plugin-for-aws/volume_snapshotter_test.go

Lines changed: 135 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,13 @@ package main
1818

1919
import (
2020
"encoding/json"
21+
"errors"
2122
"github.com/aws/aws-sdk-go-v2/service/ec2/types"
23+
"github.com/aws/smithy-go"
2224
"os"
2325
"sort"
2426
"testing"
27+
"time"
2528

2629
"github.com/sirupsen/logrus"
2730
"github.com/stretchr/testify/assert"
@@ -109,7 +112,7 @@ func TestGetVolumeIDForCSI(t *testing.T) {
109112
Object: map[string]interface{}{},
110113
}
111114
csi := map[string]interface{}{}
112-
json.Unmarshal([]byte(tt.csiJSON), &csi)
115+
_ = json.Unmarshal([]byte(tt.csiJSON), &csi)
113116
res.Object["spec"] = map[string]interface{}{
114117
"csi": csi,
115118
}
@@ -223,7 +226,7 @@ func TestSetVolumeIDForCSI(t *testing.T) {
223226
Object: map[string]interface{}{},
224227
}
225228
csi := map[string]interface{}{}
226-
json.Unmarshal([]byte(tt.csiJSON), &csi)
229+
_ = json.Unmarshal([]byte(tt.csiJSON), &csi)
227230
res.Object["spec"] = map[string]interface{}{
228231
"csi": csi,
229232
}
@@ -319,7 +322,7 @@ func TestGetTagsForCluster(t *testing.T) {
319322
assert.Equal(t, test.expected, res)
320323

321324
if test.isNameSet {
322-
os.Unsetenv("AWS_CLUSTER_NAME")
325+
_ = os.Unsetenv("AWS_CLUSTER_NAME")
323326
}
324327
})
325328
}
@@ -405,3 +408,132 @@ func TestGetTags(t *testing.T) {
405408
})
406409
}
407410
}
411+
412+
func TestVolumeSnapshotterInit_VolumeCreationTimeout(t *testing.T) {
413+
tests := []struct {
414+
name string
415+
config map[string]string
416+
expectedError bool
417+
expectedTimeout time.Duration
418+
expectedInterval time.Duration
419+
}{
420+
{
421+
name: "default timeout and interval",
422+
config: map[string]string{
423+
"region": "us-east-1",
424+
},
425+
expectedTimeout: defaultVolumeCreationTimeout,
426+
expectedInterval: volumeStatusPollInterval,
427+
},
428+
{
429+
name: "custom timeout and interval",
430+
config: map[string]string{
431+
"region": "us-east-1",
432+
"volumeCreationTimeout": "5m",
433+
"volumeCreationPollInterval": "30s",
434+
},
435+
expectedTimeout: 5 * time.Minute,
436+
expectedInterval: 30 * time.Second,
437+
},
438+
{
439+
name: "invalid timeout format",
440+
config: map[string]string{
441+
"region": "us-east-1",
442+
"volumeCreationTimeout": "invalid",
443+
},
444+
expectedError: true,
445+
},
446+
{
447+
name: "invalid interval format",
448+
config: map[string]string{
449+
"region": "us-east-1",
450+
"volumeCreationPollInterval": "invalid",
451+
},
452+
expectedError: true,
453+
},
454+
}
455+
456+
for _, tt := range tests {
457+
t.Run(tt.name, func(t *testing.T) {
458+
vs := newVolumeSnapshotter(logrus.New())
459+
err := vs.Init(tt.config)
460+
461+
if tt.expectedError {
462+
assert.Error(t, err)
463+
return
464+
}
465+
466+
require.NoError(t, err)
467+
assert.Equal(t, tt.expectedTimeout, vs.volumeCreationTimeout)
468+
assert.Equal(t, tt.expectedInterval, vs.volumePollInterval)
469+
})
470+
}
471+
}
472+
473+
// mockAPIError implements smithy.APIError for testing
474+
type mockAPIError struct {
475+
errorCode string
476+
message string
477+
}
478+
479+
func (m mockAPIError) Error() string {
480+
return m.message
481+
}
482+
483+
func (m mockAPIError) ErrorCode() string {
484+
return m.errorCode
485+
}
486+
487+
func (m mockAPIError) ErrorMessage() string {
488+
return m.message
489+
}
490+
491+
func (m mockAPIError) ErrorFault() smithy.ErrorFault {
492+
return smithy.FaultClient
493+
}
494+
495+
func TestEnhanceVolumeCreationError(t *testing.T) {
496+
vs := &VolumeSnapshotter{
497+
log: logrus.New(),
498+
}
499+
volumeID := "vol-123456"
500+
501+
tests := []struct {
502+
name string
503+
inputError error
504+
expectedError string
505+
}{
506+
{
507+
name: "UnauthorizedOperation error",
508+
inputError: mockAPIError{errorCode: "UnauthorizedOperation", message: "access denied"},
509+
expectedError: "insufficient permissions to access volume vol-123456 or related KMS key. Required KMS permissions: kms:Decrypt, kms:ReEncrypt*, kms:CreateGrant. Original error: access denied",
510+
},
511+
{
512+
name: "InvalidKey.Malformed error",
513+
inputError: mockAPIError{errorCode: "InvalidKey.Malformed", message: "invalid key"},
514+
expectedError: "KMS key access failed for volume vol-123456. Ensure the KMS key exists and grants necessary permissions: kms:Decrypt, kms:ReEncrypt*, kms:CreateGrant. Original error: invalid key",
515+
},
516+
{
517+
name: "KMSKeyNotAccessibleFault error",
518+
inputError: mockAPIError{errorCode: "KMSKeyNotAccessibleFault", message: "kms key not accessible"},
519+
expectedError: "KMS key access failed for volume vol-123456. Ensure the KMS key exists and grants necessary permissions: kms:Decrypt, kms:ReEncrypt*, kms:CreateGrant. Original error: kms key not accessible",
520+
},
521+
{
522+
name: "Other API error",
523+
inputError: mockAPIError{errorCode: "SomeOtherError", message: "some other error"},
524+
expectedError: "failed to verify volume vol-123456 creation status: some other error",
525+
},
526+
{
527+
name: "Non-API error",
528+
inputError: errors.New("generic error"),
529+
expectedError: "failed to verify volume vol-123456 creation status: generic error",
530+
},
531+
}
532+
533+
for _, tt := range tests {
534+
t.Run(tt.name, func(t *testing.T) {
535+
result := vs.enhanceVolumeCreationError(tt.inputError, volumeID)
536+
assert.Contains(t, result.Error(), tt.expectedError)
537+
})
538+
}
539+
}

0 commit comments

Comments
 (0)