From ae5768d46d7e96fa6abb47c2b5c5d8f9723ba1f5 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 01:33:10 +0800 Subject: [PATCH 01/22] extend: add fs and vfio attacher interfaces and specs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define the Attacher/Lister interfaces and Spec validation for two runtime hot-attach surfaces — vhost-user-fs (typically via virtiofsd) and VFIO PCI passthrough. Backends opt in via type assertion; CLI gives clear errors on unsupported backends. State semantics: attach is runtime-only, never persisted, lost on VM stop. Cocoon does not own the backend (virtiofsd / vfio-pci binding). --- extend/fs/fs.go | 97 ++++++++++++++++++++++++++++++++++++++++ extend/fs/fs_test.go | 54 ++++++++++++++++++++++ extend/vfio/vfio.go | 97 ++++++++++++++++++++++++++++++++++++++++ extend/vfio/vfio_test.go | 77 +++++++++++++++++++++++++++++++ 4 files changed, 325 insertions(+) create mode 100644 extend/fs/fs.go create mode 100644 extend/fs/fs_test.go create mode 100644 extend/vfio/vfio.go create mode 100644 extend/vfio/vfio_test.go diff --git a/extend/fs/fs.go b/extend/fs/fs.go new file mode 100644 index 0000000..07b9a3a --- /dev/null +++ b/extend/fs/fs.go @@ -0,0 +1,97 @@ +// Package fs is the runtime attach interface for vhost-user-fs devices +// (typically backed by virtiofsd) on a running VM. +// +// State semantics: attach is runtime-only. Attached devices are not persisted +// in the VM record and disappear when the VM stops. The user must re-attach +// after a restart. Cocoon does not own the virtiofsd backend. +package fs + +import ( + "context" + "errors" + "fmt" + "path/filepath" + "regexp" +) + +const ( + DefaultNumQueues = 1 + DefaultQueueSize = 1024 + + // Tag length cap mirrors the conservative virtio-fs mount tag limit; + // longer tags risk truncation across kernels. + MaxTagLen = 36 +) + +var ( + // Tag charset is intentionally portable: usable as a CH device id suffix + // (cocoon-fs-) and safe for shell quoting and guest mount commands. + validTagRe = regexp.MustCompile(`^[a-zA-Z0-9][a-zA-Z0-9_-]{0,35}$`) + + // ErrUnsupportedBackend signals the resolved hypervisor backend cannot + // hot-plug vhost-user-fs (e.g. Firecracker). + ErrUnsupportedBackend = errors.New("backend does not support fs attach") +) + +// Spec is one attach request. +type Spec struct { + Socket string + Tag string + NumQueues int + QueueSize int +} + +// Attached is the inspect-time view of one fs device read from the +// running VM's CH config. +type Attached struct { + ID string `json:"id"` + Tag string `json:"tag"` + Socket string `json:"socket"` +} + +// Attacher hot-plugs and removes vhost-user-fs devices. +type Attacher interface { + FsAttach(ctx context.Context, vmRef string, spec Spec) (deviceID string, err error) + FsDetach(ctx context.Context, vmRef, tag string) error +} + +// Lister enumerates currently-attached fs devices from running VM state. +type Lister interface { + FsList(ctx context.Context, vmRef string) ([]Attached, error) +} + +// Validate enforces required fields and applies queue-size defaults. 
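+//
+// A minimal usage sketch (illustrative values):
+//
+//	spec := Spec{Socket: "/run/virtiofsd.sock", Tag: "data"}
+//	if err := spec.Validate(); err != nil {
+//		// reject the attach request
+//	}
+//	// When the caller left them zero, spec.NumQueues and spec.QueueSize
+//	// now carry the defaults (1 / 1024).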
+func (s *Spec) Validate() error { + if s.Socket == "" { + return fmt.Errorf("--socket is required") + } + if !filepath.IsAbs(s.Socket) { + return fmt.Errorf("--socket must be absolute, got %q", s.Socket) + } + if s.Tag == "" { + return fmt.Errorf("--tag is required") + } + if !validTagRe.MatchString(s.Tag) { + return fmt.Errorf("--tag %q invalid: must match [A-Za-z0-9][A-Za-z0-9_-]{0,35}", s.Tag) + } + if s.NumQueues < 0 { + return fmt.Errorf("--num-queues must be non-negative, got %d", s.NumQueues) + } + if s.QueueSize < 0 { + return fmt.Errorf("--queue-size must be non-negative, got %d", s.QueueSize) + } + if s.NumQueues == 0 { + s.NumQueues = DefaultNumQueues + } + if s.QueueSize == 0 { + s.QueueSize = DefaultQueueSize + } + return nil +} + +// DeriveID returns the deterministic CH device id for a tag. Both attach +// (passed to vm.add-fs) and detach (matched against vm.info) use this so +// concurrent attaches of the same tag fail at CH's id-uniqueness check. +func DeriveID(tag string) string { + return "cocoon-fs-" + tag +} diff --git a/extend/fs/fs_test.go b/extend/fs/fs_test.go new file mode 100644 index 0000000..6207c77 --- /dev/null +++ b/extend/fs/fs_test.go @@ -0,0 +1,54 @@ +package fs + +import ( + "strings" + "testing" +) + +func TestSpecValidate(t *testing.T) { + tests := []struct { + name string + spec Spec + wantErr string + wantNumQ int + wantQSize int + }{ + {name: "minimal valid", spec: Spec{Socket: "/tmp/x.sock", Tag: "share"}, wantNumQ: DefaultNumQueues, wantQSize: DefaultQueueSize}, + {name: "explicit queues", spec: Spec{Socket: "/tmp/x.sock", Tag: "share", NumQueues: 4, QueueSize: 256}, wantNumQ: 4, wantQSize: 256}, + {name: "missing socket", spec: Spec{Tag: "share"}, wantErr: "--socket is required"}, + {name: "relative socket", spec: Spec{Socket: "rel.sock", Tag: "share"}, wantErr: "--socket must be absolute"}, + {name: "missing tag", spec: Spec{Socket: "/tmp/x.sock"}, wantErr: "--tag is required"}, + {name: "tag with slash", spec: Spec{Socket: "/tmp/x.sock", Tag: "a/b"}, wantErr: "--tag"}, + {name: "tag too long", spec: Spec{Socket: "/tmp/x.sock", Tag: strings.Repeat("a", 37)}, wantErr: "--tag"}, + {name: "negative queues", spec: Spec{Socket: "/tmp/x.sock", Tag: "share", NumQueues: -1}, wantErr: "non-negative"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.spec.Validate() + if tt.wantErr != "" { + if err == nil { + t.Fatalf("want error containing %q, got nil", tt.wantErr) + } + if !strings.Contains(err.Error(), tt.wantErr) { + t.Fatalf("want error containing %q, got %v", tt.wantErr, err) + } + return + } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if tt.spec.NumQueues != tt.wantNumQ { + t.Errorf("NumQueues = %d, want %d", tt.spec.NumQueues, tt.wantNumQ) + } + if tt.spec.QueueSize != tt.wantQSize { + t.Errorf("QueueSize = %d, want %d", tt.spec.QueueSize, tt.wantQSize) + } + }) + } +} + +func TestDeriveID(t *testing.T) { + if got := DeriveID("share"); got != "cocoon-fs-share" { + t.Errorf("DeriveID(share) = %q, want cocoon-fs-share", got) + } +} diff --git a/extend/vfio/vfio.go b/extend/vfio/vfio.go new file mode 100644 index 0000000..d4454d6 --- /dev/null +++ b/extend/vfio/vfio.go @@ -0,0 +1,97 @@ +// Package vfio is the runtime attach interface for VFIO PCI passthrough +// devices on a running VM. Typical use cases: GPU, NIC, NVMe. +// +// State semantics: attach is runtime-only. Attached devices are not +// persisted in the VM record and disappear when the VM stops. 
Host-side +// IOMMU enablement and vfio-pci driver binding are the user's +// responsibility. +package vfio + +import ( + "context" + "errors" + "fmt" + "path/filepath" + "regexp" + "strings" +) + +const ( + // SysfsPCIPrefix is the canonical host path for a PCI device. + SysfsPCIPrefix = "/sys/bus/pci/devices/" +) + +var ( + // Match BDF in either short (01:00.0) or full (0000:01:00.0) form so the + // CLI accepts what `lspci` prints by default. + bdfShortRe = regexp.MustCompile(`^[0-9a-f]{2}:[0-9a-f]{2}\.[0-7]$`) + bdfFullRe = regexp.MustCompile(`^[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-7]$`) + + // User-facing id charset matches CH device-id constraints; the prefix + // "cocoon-" is reserved so cocoon-derived ids never collide. + validIDRe = regexp.MustCompile(`^[A-Za-z0-9][A-Za-z0-9_.-]{0,63}$`) + + // ErrUnsupportedBackend signals the resolved hypervisor backend cannot + // hot-plug VFIO devices (e.g. Firecracker). + ErrUnsupportedBackend = errors.New("backend does not support device attach") +) + +// Spec is one attach request. PCI may be a short BDF, full BDF, or a sysfs +// path; NormalizePath canonicalizes it. +type Spec struct { + PCI string + ID string +} + +// Attached is the inspect-time view of one VFIO device from running VM state. +type Attached struct { + ID string `json:"id"` + BDF string `json:"bdf,omitempty"` +} + +// Attacher hot-plugs and removes VFIO PCI passthrough devices. +type Attacher interface { + DeviceAttach(ctx context.Context, vmRef string, spec Spec) (deviceID string, err error) + DeviceDetach(ctx context.Context, vmRef, id string) error +} + +// Lister enumerates VFIO devices from running VM state. +type Lister interface { + DeviceList(ctx context.Context, vmRef string) ([]Attached, error) +} + +// Validate checks the user input shape only. Path existence is asserted +// later by the backend right before calling CH (the host file may be +// removed between CLI parse and the API call). +func (s *Spec) Validate() error { + if s.PCI == "" { + return fmt.Errorf("--pci is required") + } + if s.ID != "" && (strings.HasPrefix(s.ID, "cocoon-") || !validIDRe.MatchString(s.ID)) { + return fmt.Errorf("--id %q invalid: must match [A-Za-z0-9][A-Za-z0-9_.-]{0,63} and not start with cocoon-", s.ID) + } + if _, err := NormalizePath(s.PCI); err != nil { + return err + } + return nil +} + +// NormalizePath maps any of {01:00.0, 0000:01:00.0, /sys/bus/pci/devices/} +// into the canonical sysfs path that CH's vm.add-device expects. 
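+//
+// Example mappings (the same cases the unit tests cover):
+//
+//	"01:00.0"      -> "/sys/bus/pci/devices/0000:01:00.0"
+//	"0000:01:00.0" -> "/sys/bus/pci/devices/0000:01:00.0"
+//	"/dev/null"    -> "/dev/null" (absolute non-sysfs paths pass through cleaned)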
+func NormalizePath(input string) (string, error) { + in := strings.TrimSpace(input) + if in == "" { + return "", fmt.Errorf("empty pci path") + } + if strings.HasPrefix(in, "/") { + return filepath.Clean(in), nil + } + low := strings.ToLower(in) + switch { + case bdfFullRe.MatchString(low): + return SysfsPCIPrefix + low, nil + case bdfShortRe.MatchString(low): + return SysfsPCIPrefix + "0000:" + low, nil + } + return "", fmt.Errorf("--pci %q invalid: expect BDF (01:00.0 / 0000:01:00.0) or sysfs path", input) +} diff --git a/extend/vfio/vfio_test.go b/extend/vfio/vfio_test.go new file mode 100644 index 0000000..1e06a7b --- /dev/null +++ b/extend/vfio/vfio_test.go @@ -0,0 +1,77 @@ +package vfio + +import ( + "strings" + "testing" +) + +func TestNormalizePath(t *testing.T) { + tests := []struct { + name string + in string + want string + wantErr string + }{ + {name: "short BDF", in: "01:00.0", want: SysfsPCIPrefix + "0000:01:00.0"}, + {name: "full BDF", in: "0000:01:00.0", want: SysfsPCIPrefix + "0000:01:00.0"}, + {name: "uppercase BDF", in: "00:1F.7", want: SysfsPCIPrefix + "0000:00:1f.7"}, + {name: "sysfs path", in: SysfsPCIPrefix + "0000:01:00.0", want: SysfsPCIPrefix + "0000:01:00.0"}, + {name: "sysfs path with trailing slash", in: SysfsPCIPrefix + "0000:01:00.0/", want: SysfsPCIPrefix + "0000:01:00.0"}, + {name: "absolute non-sysfs path", in: "/dev/null", want: "/dev/null"}, + {name: "empty", in: "", wantErr: "empty"}, + {name: "garbage", in: "not-a-bdf", wantErr: "invalid"}, + {name: "bad function digit", in: "01:00.8", wantErr: "invalid"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := NormalizePath(tt.in) + if tt.wantErr != "" { + if err == nil { + t.Fatalf("want error containing %q, got nil", tt.wantErr) + } + if !strings.Contains(err.Error(), tt.wantErr) { + t.Fatalf("want error containing %q, got %v", tt.wantErr, err) + } + return + } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tt.want { + t.Errorf("got %q, want %q", got, tt.want) + } + }) + } +} + +func TestSpecValidate(t *testing.T) { + tests := []struct { + name string + spec Spec + wantErr string + }{ + {name: "valid short BDF", spec: Spec{PCI: "01:00.0"}}, + {name: "valid full BDF with id", spec: Spec{PCI: "0000:01:00.0", ID: "mygpu"}}, + {name: "missing pci", spec: Spec{}, wantErr: "--pci is required"}, + {name: "id with cocoon prefix", spec: Spec{PCI: "01:00.0", ID: "cocoon-x"}, wantErr: "--id"}, + {name: "id with bad chars", spec: Spec{PCI: "01:00.0", ID: "bad id"}, wantErr: "--id"}, + {name: "bad pci", spec: Spec{PCI: "junk"}, wantErr: "invalid"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.spec.Validate() + if tt.wantErr != "" { + if err == nil { + t.Fatalf("want error containing %q, got nil", tt.wantErr) + } + if !strings.Contains(err.Error(), tt.wantErr) { + t.Fatalf("want error containing %q, got %v", tt.wantErr, err) + } + return + } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + }) + } +} From 8fa8a2b806b8d1e4b43719381c71dc9b07418272 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 01:35:22 +0800 Subject: [PATCH 02/22] vm: add shared-memory config for CH virtio-fs readiness Vhost-user-fs hot-plug requires CH memory shared=on; this is fixed at VM creation. Add VMConfig.SharedMemory, plumb through CH chMemory and the --memory CLI arg, propagate via clone/restore. Reject --fc with --shared-memory since Firecracker has no virtio-fs. 
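For example (illustrative):

    cocoon vm run --shared-memory --name share-host \
        ghcr.io/cocoonstack/cocoon/ubuntu:24.04

boots a CH VM whose --memory argument carries ",shared=on" and is therefore
eligible for a later vm fs attach.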
patchCHConfig already preserves memory.shared via raw-JSON merging, so clone/restore round-trip without further work. --- cmd/core/helpers.go | 3 +++ cmd/vm/commands.go | 1 + cmd/vm/run.go | 3 +++ hypervisor/cloudhypervisor/api.go | 1 + hypervisor/cloudhypervisor/args.go | 5 ++++- types/config.go | 5 +++++ 6 files changed, 17 insertions(+), 1 deletion(-) diff --git a/cmd/core/helpers.go b/cmd/core/helpers.go index adeb084..6149785 100644 --- a/cmd/core/helpers.go +++ b/cmd/core/helpers.go @@ -306,6 +306,7 @@ func VMConfigFromFlags(cmd *cobra.Command, image string) (*types.VMConfig, error password, _ := cmd.Flags().GetString("password") noDirectIO, _ := cmd.Flags().GetBool("no-direct-io") windows, _ := cmd.Flags().GetBool("windows") + sharedMemory, _ := cmd.Flags().GetBool("shared-memory") dataDiskRaw, _ := cmd.Flags().GetStringArray("data-disk") if vmName == "" { @@ -338,6 +339,7 @@ func VMConfigFromFlags(cmd *cobra.Command, image string) (*types.VMConfig, error Network: network, NoDirectIO: noDirectIO, Windows: windows, + SharedMemory: sharedMemory, }, User: user, Password: password, @@ -384,6 +386,7 @@ func CloneVMConfigFromFlags(cmd *cobra.Command, snapCfg *types.SnapshotConfig) ( Network: network, NoDirectIO: noDirectIO, Windows: snapCfg.Windows, + SharedMemory: snapCfg.SharedMemory, }, OnDemand: onDemand, } diff --git a/cmd/vm/commands.go b/cmd/vm/commands.go index a36576e..c51dd4a 100644 --- a/cmd/vm/commands.go +++ b/cmd/vm/commands.go @@ -171,6 +171,7 @@ func addVMFlags(cmd *cobra.Command) { cmd.Flags().String("password", "cocoon", "guest password for cloud-init (cloudimg only)") cmd.Flags().Bool("no-direct-io", false, "disable O_DIRECT on writable disks (use page cache instead; CH only)") cmd.Flags().Bool("windows", false, "Windows guest (UEFI boot, kvm_hyperv=on, no cidata)") + cmd.Flags().Bool("shared-memory", false, "enable CH memory shared=on; required to attach vhost-user-fs later (CH only, fixed for VM lifetime)") cmd.Flags().StringArray("data-disk", nil, "extra data disk: size=20G[,name=...][,fstype=ext4|none][,mount=/mnt/x][,directio=on|off|auto]; repeatable") } diff --git a/cmd/vm/run.go b/cmd/vm/run.go index cdd47ad..e72f3e4 100644 --- a/cmd/vm/run.go +++ b/cmd/vm/run.go @@ -332,6 +332,9 @@ func (h Handler) createVM(cmd *cobra.Command, image string) (context.Context, *t if conf.UseFirecracker && vmCfg.Windows { return nil, nil, nil, fmt.Errorf("--fc and --windows are mutually exclusive: Firecracker does not support Windows guests") } + if conf.UseFirecracker && vmCfg.SharedMemory { + return nil, nil, nil, fmt.Errorf("--fc and --shared-memory are mutually exclusive: Firecracker does not support vhost-user-fs hot-plug") + } bridgeDev, _ := cmd.Flags().GetString("bridge") if bridgeDev != "" && vmCfg.Network != "" { return nil, nil, nil, fmt.Errorf("--bridge and --network are mutually exclusive") diff --git a/hypervisor/cloudhypervisor/api.go b/hypervisor/cloudhypervisor/api.go index cb3665a..644ed2b 100644 --- a/hypervisor/cloudhypervisor/api.go +++ b/hypervisor/cloudhypervisor/api.go @@ -44,6 +44,7 @@ type chCPUs struct { type chMemory struct { Size int64 `json:"size"` HugePages bool `json:"hugepages,omitempty"` + Shared bool `json:"shared,omitempty"` } type chDisk struct { diff --git a/hypervisor/cloudhypervisor/args.go b/hypervisor/cloudhypervisor/args.go index 2817e53..b9d09c4 100644 --- a/hypervisor/cloudhypervisor/args.go +++ b/hypervisor/cloudhypervisor/args.go @@ -49,7 +49,7 @@ func buildVMConfig(_ context.Context, rec *hypervisor.VMRecord, consoleSockPath cfg := 
&chVMConfig{ CPUs: chCPUs{BootVCPUs: cpu, MaxVCPUs: maxVCPUs, KVMHyperV: rec.Config.Windows}, - Memory: chMemory{Size: mem, HugePages: utils.DetectHugePages()}, + Memory: chMemory{Size: mem, HugePages: utils.DetectHugePages(), Shared: rec.Config.SharedMemory}, RNG: chRNG{Src: "/dev/urandom"}, Watchdog: true, } @@ -109,6 +109,9 @@ func buildCLIArgs(cfg *chVMConfig, socketPath string) []string { if cfg.Memory.HugePages { mem += ",hugepages=on" } + if cfg.Memory.Shared { + mem += ",shared=on" + } args = append(args, "--memory", mem) if len(cfg.Disks) > 0 { diff --git a/types/config.go b/types/config.go index 52801c1..f0fe967 100644 --- a/types/config.go +++ b/types/config.go @@ -15,4 +15,9 @@ type Config struct { Network string `json:"network,omitempty"` // CNI conflist name; empty = default NoDirectIO bool `json:"no_direct_io,omitempty"` // disable O_DIRECT on writable disks Windows bool `json:"windows,omitempty"` // Windows guest: UEFI boot, kvm_hyperv=on, no cidata + // SharedMemory toggles CH memory shared=on, the prerequisite for + // vhost-user-fs hot-plug. Decided at VM creation: the memory model is + // fixed for the VM's lifetime and propagates through clone/restore via + // the persisted config and the snapshot-time CH config.json. + SharedMemory bool `json:"shared_memory,omitempty"` } From a63968e538d495eda656e6e55dd28fb3429c57f1 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 01:37:19 +0800 Subject: [PATCH 03/22] cloudhypervisor: add vm.add-fs / vm.add-device API helpers Introduce chFs / chDevice / chPciDeviceInfo types matching CH OpenAPI, extend chVMInfoConfig to surface fs and devices arrays, and add addFsVM / addDeviceVM / getVMInfo / decodePciDeviceInfo helpers. vmAPICall extracted from vmAPI so callers needing the response body share the existing retry logic. Vhost-user-fs and VFIO add use the returned PciDeviceInfo to record the device id. --- hypervisor/cloudhypervisor/api.go | 34 +++++++++++++++-- hypervisor/cloudhypervisor/helper.go | 57 +++++++++++++++++++++++++++- 2 files changed, 86 insertions(+), 5 deletions(-) diff --git a/hypervisor/cloudhypervisor/api.go b/hypervisor/cloudhypervisor/api.go index 644ed2b..953e1ce 100644 --- a/hypervisor/cloudhypervisor/api.go +++ b/hypervisor/cloudhypervisor/api.go @@ -83,9 +83,35 @@ type chRuntimeFile struct { Socket string `json:"socket,omitempty"` } +type chFs struct { + ID string `json:"id,omitempty"` + Tag string `json:"tag"` + Socket string `json:"socket"` + NumQueues int `json:"num_queues"` + QueueSize int `json:"queue_size"` +} + +type chDevice struct { + ID string `json:"id,omitempty"` + Path string `json:"path"` + IOMMU bool `json:"iommu,omitempty"` +} + +// chPciDeviceInfo is the response body from vm.add-fs / vm.add-device / +// vm.add-disk / vm.add-net (HTTP 200). 
+type chPciDeviceInfo struct { + ID string `json:"id"` + BDF string `json:"bdf"` +} + type chVMInfoResponse struct { - Config struct { - Serial chRuntimeFile `json:"serial"` - Console chRuntimeFile `json:"console"` - } `json:"config"` + Config chVMInfoConfig `json:"config"` +} + +type chVMInfoConfig struct { + Serial chRuntimeFile `json:"serial"` + Console chRuntimeFile `json:"console"` + Memory chMemory `json:"memory"` + Fs []chFs `json:"fs,omitempty"` + Devices []chDevice `json:"devices,omitempty"` } diff --git a/hypervisor/cloudhypervisor/helper.go b/hypervisor/cloudhypervisor/helper.go index fd7ae06..3ed2ad8 100644 --- a/hypervisor/cloudhypervisor/helper.go +++ b/hypervisor/cloudhypervisor/helper.go @@ -94,10 +94,16 @@ func hasMemoryRangeFile(srcDir string) (bool, error) { } func vmAPI(ctx context.Context, hc *http.Client, endpoint string, body []byte, successCodes ...int) error { - _, err := utils.DoAPIWithRetry(ctx, hc, http.MethodPut, "http://localhost/api/v1/"+endpoint, body, successCodes...) + _, err := vmAPICall(ctx, hc, endpoint, body, successCodes...) return err } +// vmAPICall returns the raw response body so callers that need to decode +// PciDeviceInfo (vm.add-fs, vm.add-device, ...) can use the same retry path. +func vmAPICall(ctx context.Context, hc *http.Client, endpoint string, body []byte, successCodes ...int) ([]byte, error) { + return utils.DoAPIWithRetry(ctx, hc, http.MethodPut, "http://localhost/api/v1/"+endpoint, body, successCodes...) +} + func shutdownVM(ctx context.Context, hc *http.Client) error { return vmAPI(ctx, hc, "vm.shutdown", nil) } @@ -168,6 +174,55 @@ func addNetVM(ctx context.Context, hc *http.Client, net chNet) error { return vmAPI(ctx, hc, "vm.add-net", body, http.StatusOK, http.StatusNoContent) } +func addFsVM(ctx context.Context, hc *http.Client, fs chFs) (chPciDeviceInfo, error) { + body, err := json.Marshal(fs) + if err != nil { + return chPciDeviceInfo{}, fmt.Errorf("marshal add-fs request: %w", err) + } + resp, err := vmAPICall(ctx, hc, "vm.add-fs", body, http.StatusOK, http.StatusNoContent) + if err != nil { + return chPciDeviceInfo{}, err + } + return decodePciDeviceInfo(resp) +} + +func addDeviceVM(ctx context.Context, hc *http.Client, dev chDevice) (chPciDeviceInfo, error) { + body, err := json.Marshal(dev) + if err != nil { + return chPciDeviceInfo{}, fmt.Errorf("marshal add-device request: %w", err) + } + resp, err := vmAPICall(ctx, hc, "vm.add-device", body, http.StatusOK, http.StatusNoContent) + if err != nil { + return chPciDeviceInfo{}, err + } + return decodePciDeviceInfo(resp) +} + +// getVMInfo fetches vm.info; cocoon uses it to detect tag/id conflicts +// before hot-add and to surface attached devices through inspect. 
+func getVMInfo(ctx context.Context, hc *http.Client) (*chVMInfoResponse, error) { + body, err := utils.DoAPI(ctx, hc, http.MethodGet, "http://localhost/api/v1/vm.info", nil, http.StatusOK) + if err != nil { + return nil, fmt.Errorf("query vm.info: %w", err) + } + var info chVMInfoResponse + if err := json.Unmarshal(body, &info); err != nil { + return nil, fmt.Errorf("decode vm.info: %w", err) + } + return &info, nil +} + +func decodePciDeviceInfo(resp []byte) (chPciDeviceInfo, error) { + if len(resp) == 0 { + return chPciDeviceInfo{}, nil + } + var info chPciDeviceInfo + if err := json.Unmarshal(resp, &info); err != nil { + return chPciDeviceInfo{}, fmt.Errorf("decode PciDeviceInfo: %w", err) + } + return info, nil +} + func powerButton(ctx context.Context, hc *http.Client) error { return vmAPI(ctx, hc, "vm.power-button", nil) } From 1da43b98ea3aefa5f9f42e34a86315f68fd86938 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 01:38:17 +0800 Subject: [PATCH 04/22] cloudhypervisor: implement fs.Attacher and vfio.Attacher FsAttach: validate spec, assert VM running and memory.shared=on, detect tag/id collision via vm.info, call vm.add-fs with id derived from tag (cocoon-fs-) so concurrent attaches collide on CH's id-uniqueness check. DeviceAttach: normalize PCI input to canonical sysfs path, assert it exists and is a directory, detect path/id collision via vm.info, call vm.add-device. Detach common: vm.remove-device. FsList / DeviceList read vm.info and translate into the runtime-only inspect DTOs from the extend packages. --- hypervisor/cloudhypervisor/extend.go | 227 +++++++++++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 hypervisor/cloudhypervisor/extend.go diff --git a/hypervisor/cloudhypervisor/extend.go b/hypervisor/cloudhypervisor/extend.go new file mode 100644 index 0000000..d6f5e15 --- /dev/null +++ b/hypervisor/cloudhypervisor/extend.go @@ -0,0 +1,227 @@ +package cloudhypervisor + +import ( + "context" + "errors" + "fmt" + "net/http" + "os" + "strings" + + "github.com/cocoonstack/cocoon/extend/fs" + "github.com/cocoonstack/cocoon/extend/vfio" + "github.com/cocoonstack/cocoon/hypervisor" + "github.com/cocoonstack/cocoon/types" + "github.com/cocoonstack/cocoon/utils" +) + +var ( + _ fs.Attacher = (*CloudHypervisor)(nil) + _ fs.Lister = (*CloudHypervisor)(nil) + _ vfio.Attacher = (*CloudHypervisor)(nil) + _ vfio.Lister = (*CloudHypervisor)(nil) +) + +// FsAttach hot-plugs a vhost-user-fs device onto a running CH VM. 
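+// The sequence: validate the spec, assert the VM process is running, read
+// vm.info to check memory shared=on and tag/id collisions, then call
+// vm.add-fs with the deterministic id from fs.DeriveID.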
+func (ch *CloudHypervisor) FsAttach(ctx context.Context, vmRef string, spec fs.Spec) (string, error) { + if err := spec.Validate(); err != nil { + return "", err + } + hc, err := ch.runningVMClient(ctx, vmRef) + if err != nil { + return "", err + } + info, err := getVMInfo(ctx, hc) + if err != nil { + return "", err + } + if !info.Config.Memory.Shared { + return "", fmt.Errorf("fs attach requires VM memory shared=on; recreate the VM with --shared-memory") + } + id := fs.DeriveID(spec.Tag) + for _, existing := range info.Config.Fs { + if existing.Tag == spec.Tag { + return "", fmt.Errorf("fs tag %q already attached", spec.Tag) + } + if existing.ID == id { + return "", fmt.Errorf("fs id %q already attached", id) + } + } + resp, err := addFsVM(ctx, hc, chFs{ + ID: id, + Tag: spec.Tag, + Socket: spec.Socket, + NumQueues: spec.NumQueues, + QueueSize: spec.QueueSize, + }) + if err != nil { + return "", fmt.Errorf("vm.add-fs: %w", err) + } + if resp.ID != "" { + return resp.ID, nil + } + return id, nil +} + +// FsDetach removes a previously attached vhost-user-fs device by tag. +func (ch *CloudHypervisor) FsDetach(ctx context.Context, vmRef, tag string) error { + if tag == "" { + return fmt.Errorf("--tag is required") + } + hc, err := ch.runningVMClient(ctx, vmRef) + if err != nil { + return err + } + info, err := getVMInfo(ctx, hc) + if err != nil { + return err + } + deviceID := "" + for _, existing := range info.Config.Fs { + if existing.Tag == tag { + deviceID = existing.ID + break + } + } + if deviceID == "" { + return fmt.Errorf("fs tag %q not attached", tag) + } + if err := removeDeviceVM(ctx, hc, deviceID); err != nil { + return fmt.Errorf("vm.remove-device %s: %w", deviceID, err) + } + return nil +} + +// FsList enumerates currently attached vhost-user-fs devices. +func (ch *CloudHypervisor) FsList(ctx context.Context, vmRef string) ([]fs.Attached, error) { + hc, err := ch.runningVMClient(ctx, vmRef) + if err != nil { + if errors.Is(err, hypervisor.ErrNotRunning) { + return nil, nil + } + return nil, err + } + info, err := getVMInfo(ctx, hc) + if err != nil { + return nil, err + } + out := make([]fs.Attached, 0, len(info.Config.Fs)) + for _, f := range info.Config.Fs { + out = append(out, fs.Attached{ID: f.ID, Tag: f.Tag, Socket: f.Socket}) + } + return out, nil +} + +// DeviceAttach hot-plugs a VFIO PCI passthrough device onto a running CH VM. 
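+// It normalizes the PCI input to a sysfs path, checks the host path is an
+// existing directory, preflights vm.info for path/id collisions, and calls
+// vm.add-device.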
+func (ch *CloudHypervisor) DeviceAttach(ctx context.Context, vmRef string, spec vfio.Spec) (string, error) { + if err := spec.Validate(); err != nil { + return "", err + } + path, err := vfio.NormalizePath(spec.PCI) + if err != nil { + return "", err + } + if st, statErr := os.Stat(path); statErr != nil { + return "", fmt.Errorf("pci path %s: %w", path, statErr) + } else if !st.IsDir() { + return "", fmt.Errorf("pci path %s: not a directory", path) + } + hc, err := ch.runningVMClient(ctx, vmRef) + if err != nil { + return "", err + } + info, err := getVMInfo(ctx, hc) + if err != nil { + return "", err + } + for _, existing := range info.Config.Devices { + if existing.Path == path { + return "", fmt.Errorf("device %s already attached (id=%s)", path, existing.ID) + } + if spec.ID != "" && existing.ID == spec.ID { + return "", fmt.Errorf("device id %q already in use", spec.ID) + } + } + resp, err := addDeviceVM(ctx, hc, chDevice{ID: spec.ID, Path: path}) + if err != nil { + return "", fmt.Errorf("vm.add-device: %w", err) + } + if resp.ID != "" { + return resp.ID, nil + } + return spec.ID, nil +} + +// DeviceDetach removes a previously attached VFIO device by id. +func (ch *CloudHypervisor) DeviceDetach(ctx context.Context, vmRef, id string) error { + if id == "" { + return fmt.Errorf("--id is required") + } + hc, err := ch.runningVMClient(ctx, vmRef) + if err != nil { + return err + } + info, err := getVMInfo(ctx, hc) + if err != nil { + return err + } + found := false + for _, existing := range info.Config.Devices { + if existing.ID == id { + found = true + break + } + } + if !found { + return fmt.Errorf("device id %q not attached", id) + } + if err := removeDeviceVM(ctx, hc, id); err != nil { + return fmt.Errorf("vm.remove-device %s: %w", id, err) + } + return nil +} + +// DeviceList enumerates currently attached VFIO PCI passthrough devices. +func (ch *CloudHypervisor) DeviceList(ctx context.Context, vmRef string) ([]vfio.Attached, error) { + hc, err := ch.runningVMClient(ctx, vmRef) + if err != nil { + if errors.Is(err, hypervisor.ErrNotRunning) { + return nil, nil + } + return nil, err + } + info, err := getVMInfo(ctx, hc) + if err != nil { + return nil, err + } + out := make([]vfio.Attached, 0, len(info.Config.Devices)) + for _, d := range info.Config.Devices { + out = append(out, vfio.Attached{ID: d.ID, BDF: bdfFromSysfsPath(d.Path)}) + } + return out, nil +} + +// runningVMClient resolves vmRef, asserts the VM is in Running state, and +// returns an http.Client connected to its CH API socket. +func (ch *CloudHypervisor) runningVMClient(ctx context.Context, vmRef string) (*http.Client, error) { + vmID, err := ch.ResolveRef(ctx, vmRef) + if err != nil { + return nil, err + } + rec, err := ch.LoadRecord(ctx, vmID) + if err != nil { + return nil, err + } + if rec.State != types.VMStateRunning { + return nil, fmt.Errorf("vm %s is %s: %w", vmID, rec.State, hypervisor.ErrNotRunning) + } + if !utils.IsProcessAlive(rec.PID) { + return nil, fmt.Errorf("vm %s pid %d not alive: %w", vmID, rec.PID, hypervisor.ErrNotRunning) + } + return utils.NewSocketHTTPClient(hypervisor.SocketPath(rec.RunDir)), nil +} + +// bdfFromSysfsPath strips /sys/bus/pci/devices/ prefix when present. 
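+// e.g. /sys/bus/pci/devices/0000:01:00.0 -> 0000:01:00.0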
+func bdfFromSysfsPath(p string) string { + return strings.TrimPrefix(p, vfio.SysfsPCIPrefix) +} From a7aad99f00391543e6309de55a853726e6f6dd94 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 01:39:46 +0800 Subject: [PATCH 05/22] cmd/vm: add fs and device attach/detach subcommands cocoon vm fs attach/detach VM --socket / --tag / [num/queue size] cocoon vm device attach/detach VM --pci / [--id] Handlers type-assert to extend/fs.Attacher and extend/vfio.Attacher; unsupported backends (Firecracker today) report a clear error naming the backend. ErrNotRunning is unwrapped into a friendlier message so users see why the call failed. --- cmd/vm/attach.go | 124 +++++++++++++++++++++++++++++++++++++++++++++ cmd/vm/commands.go | 71 ++++++++++++++++++++++++++ 2 files changed, 195 insertions(+) create mode 100644 cmd/vm/attach.go diff --git a/cmd/vm/attach.go b/cmd/vm/attach.go new file mode 100644 index 0000000..2ba3f92 --- /dev/null +++ b/cmd/vm/attach.go @@ -0,0 +1,124 @@ +package vm + +import ( + "errors" + "fmt" + + "github.com/projecteru2/core/log" + "github.com/spf13/cobra" + + cmdcore "github.com/cocoonstack/cocoon/cmd/core" + "github.com/cocoonstack/cocoon/extend/fs" + "github.com/cocoonstack/cocoon/extend/vfio" + "github.com/cocoonstack/cocoon/hypervisor" +) + +func (h Handler) FsAttach(cmd *cobra.Command, args []string) error { + ctx, conf, err := h.Init(cmd) + if err != nil { + return err + } + hyper, err := cmdcore.FindHypervisor(ctx, conf, args[0]) + if err != nil { + return fmt.Errorf("fs attach: %w", err) + } + a, ok := hyper.(fs.Attacher) + if !ok { + return fmt.Errorf("fs attach: backend %s: %w", hyper.Type(), fs.ErrUnsupportedBackend) + } + socket, _ := cmd.Flags().GetString("socket") + tag, _ := cmd.Flags().GetString("tag") + numQ, _ := cmd.Flags().GetInt("num-queues") + qSize, _ := cmd.Flags().GetInt("queue-size") + id, err := a.FsAttach(ctx, args[0], fs.Spec{Socket: socket, Tag: tag, NumQueues: numQ, QueueSize: qSize}) + if err != nil { + return classifyAttachErr(err) + } + if done, jsonErr := cmdcore.MaybeOutputJSON(cmd, map[string]string{"vm": args[0], "tag": tag, "id": id}); done { + return jsonErr + } + log.WithFunc("cmd.vm.fs.attach").Infof(ctx, "attached fs tag=%s id=%s vm=%s", tag, id, args[0]) + return nil +} + +func (h Handler) FsDetach(cmd *cobra.Command, args []string) error { + ctx, conf, err := h.Init(cmd) + if err != nil { + return err + } + hyper, err := cmdcore.FindHypervisor(ctx, conf, args[0]) + if err != nil { + return fmt.Errorf("fs detach: %w", err) + } + a, ok := hyper.(fs.Attacher) + if !ok { + return fmt.Errorf("fs detach: backend %s: %w", hyper.Type(), fs.ErrUnsupportedBackend) + } + tag, _ := cmd.Flags().GetString("tag") + if err := a.FsDetach(ctx, args[0], tag); err != nil { + return classifyAttachErr(err) + } + if done, jsonErr := cmdcore.MaybeOutputJSON(cmd, map[string]string{"vm": args[0], "tag": tag}); done { + return jsonErr + } + log.WithFunc("cmd.vm.fs.detach").Infof(ctx, "detached fs tag=%s vm=%s", tag, args[0]) + return nil +} + +func (h Handler) DeviceAttach(cmd *cobra.Command, args []string) error { + ctx, conf, err := h.Init(cmd) + if err != nil { + return err + } + hyper, err := cmdcore.FindHypervisor(ctx, conf, args[0]) + if err != nil { + return fmt.Errorf("device attach: %w", err) + } + a, ok := hyper.(vfio.Attacher) + if !ok { + return fmt.Errorf("device attach: backend %s: %w", hyper.Type(), vfio.ErrUnsupportedBackend) + } + pci, _ := cmd.Flags().GetString("pci") + id, _ := cmd.Flags().GetString("id") + deviceID, err := 
a.DeviceAttach(ctx, args[0], vfio.Spec{PCI: pci, ID: id}) + if err != nil { + return classifyAttachErr(err) + } + if done, jsonErr := cmdcore.MaybeOutputJSON(cmd, map[string]string{"vm": args[0], "pci": pci, "id": deviceID}); done { + return jsonErr + } + log.WithFunc("cmd.vm.device.attach").Infof(ctx, "attached device pci=%s id=%s vm=%s", pci, deviceID, args[0]) + return nil +} + +func (h Handler) DeviceDetach(cmd *cobra.Command, args []string) error { + ctx, conf, err := h.Init(cmd) + if err != nil { + return err + } + hyper, err := cmdcore.FindHypervisor(ctx, conf, args[0]) + if err != nil { + return fmt.Errorf("device detach: %w", err) + } + a, ok := hyper.(vfio.Attacher) + if !ok { + return fmt.Errorf("device detach: backend %s: %w", hyper.Type(), vfio.ErrUnsupportedBackend) + } + id, _ := cmd.Flags().GetString("id") + if err := a.DeviceDetach(ctx, args[0], id); err != nil { + return classifyAttachErr(err) + } + if done, jsonErr := cmdcore.MaybeOutputJSON(cmd, map[string]string{"vm": args[0], "id": id}); done { + return jsonErr + } + log.WithFunc("cmd.vm.device.detach").Infof(ctx, "detached device id=%s vm=%s", id, args[0]) + return nil +} + +// classifyAttachErr surfaces ErrNotRunning more clearly than the generic wrap. +func classifyAttachErr(err error) error { + if errors.Is(err, hypervisor.ErrNotRunning) { + return fmt.Errorf("vm is not running: %w", err) + } + return err +} diff --git a/cmd/vm/commands.go b/cmd/vm/commands.go index c51dd4a..fa2167b 100644 --- a/cmd/vm/commands.go +++ b/cmd/vm/commands.go @@ -20,6 +20,10 @@ type Actions interface { Restore(cmd *cobra.Command, args []string) error Debug(cmd *cobra.Command, args []string) error Status(cmd *cobra.Command, args []string) error + FsAttach(cmd *cobra.Command, args []string) error + FsDetach(cmd *cobra.Command, args []string) error + DeviceAttach(cmd *cobra.Command, args []string) error + DeviceDetach(cmd *cobra.Command, args []string) error } // Command builds the "vm" parent command with all subcommands. 
@@ -152,10 +156,77 @@ func Command(h Actions) *cobra.Command { restoreCmd, debugCmd, statusCmd, + buildFsCommand(h), + buildDeviceCommand(h), ) return vmCmd } +func buildFsCommand(h Actions) *cobra.Command { + fsCmd := &cobra.Command{ + Use: "fs", + Short: "Attach/detach a vhost-user-fs share to a running VM (CH only)", + } + + attachCmd := &cobra.Command{ + Use: "attach VM", + Short: "Attach a vhost-user-fs device to a running VM", + Args: cobra.ExactArgs(1), + RunE: h.FsAttach, + } + attachCmd.Flags().String("socket", "", "absolute path to a virtiofsd unix socket (required)") + attachCmd.Flags().String("tag", "", "guest mount tag (required; also detach key)") + attachCmd.Flags().Int("num-queues", 0, "request queues (0 = default 1)") + attachCmd.Flags().Int("queue-size", 0, "queue depth (0 = default 1024)") //nolint:mnd + _ = attachCmd.MarkFlagRequired("socket") + _ = attachCmd.MarkFlagRequired("tag") + cmdcore.AddOutputFlag(attachCmd) + + detachCmd := &cobra.Command{ + Use: "detach VM", + Short: "Detach a vhost-user-fs device from a running VM", + Args: cobra.ExactArgs(1), + RunE: h.FsDetach, + } + detachCmd.Flags().String("tag", "", "guest mount tag (required)") + _ = detachCmd.MarkFlagRequired("tag") + cmdcore.AddOutputFlag(detachCmd) + + fsCmd.AddCommand(attachCmd, detachCmd) + return fsCmd +} + +func buildDeviceCommand(h Actions) *cobra.Command { + devCmd := &cobra.Command{ + Use: "device", + Short: "Attach/detach a VFIO PCI passthrough device to a running VM (CH only)", + } + + attachCmd := &cobra.Command{ + Use: "attach VM", + Short: "Attach a VFIO PCI device to a running VM", + Args: cobra.ExactArgs(1), + RunE: h.DeviceAttach, + } + attachCmd.Flags().String("pci", "", "BDF (01:00.0 / 0000:01:00.0) or sysfs path /sys/bus/pci/devices/") + attachCmd.Flags().String("id", "", "optional device id; CH auto-generates if empty (must not start with cocoon-)") + _ = attachCmd.MarkFlagRequired("pci") + cmdcore.AddOutputFlag(attachCmd) + + detachCmd := &cobra.Command{ + Use: "detach VM", + Short: "Detach a VFIO PCI device from a running VM", + Args: cobra.ExactArgs(1), + RunE: h.DeviceDetach, + } + detachCmd.Flags().String("id", "", "device id returned by attach (required)") + _ = detachCmd.MarkFlagRequired("id") + cmdcore.AddOutputFlag(detachCmd) + + devCmd.AddCommand(attachCmd, detachCmd) + return devCmd +} + func addVMFlags(cmd *cobra.Command) { cmd.Flags().Bool("fc", false, "use Firecracker backend instead of Cloud Hypervisor (OCI images only)") cmd.Flags().String("name", "", "VM name") From 6d9df669662a054e83bcec780d2c96960776f228 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 01:40:27 +0800 Subject: [PATCH 06/22] cmd/vm: extend inspect with runtime attached devices Wrap types.VM in an inspect-only DTO that adds attached_devices when the VM is running and the backend implements fs.Lister / vfio.Lister. Listing failures are logged and dropped: inspect should not fail just because vm.info is briefly unreachable. Cocoon never persists attached devices, so types.VM stays unchanged. 
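Illustrative inspect output fragment for a running VM (field shapes follow
the DTOs below; values made up):

  "attached_devices": {
    "fs": [
      {"id": "cocoon-fs-data", "tag": "data", "socket": "/tmp/virtiofsd.sock"}
    ],
    "devices": [
      {"id": "mygpu", "bdf": "0000:01:00.0"}
    ]
  }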
--- cmd/vm/lifecycle.go | 49 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/cmd/vm/lifecycle.go b/cmd/vm/lifecycle.go index 16f5c94..1b90946 100644 --- a/cmd/vm/lifecycle.go +++ b/cmd/vm/lifecycle.go @@ -13,12 +13,28 @@ import ( cmdcore "github.com/cocoonstack/cocoon/cmd/core" "github.com/cocoonstack/cocoon/config" "github.com/cocoonstack/cocoon/console" + "github.com/cocoonstack/cocoon/extend/fs" + "github.com/cocoonstack/cocoon/extend/vfio" "github.com/cocoonstack/cocoon/hypervisor" "github.com/cocoonstack/cocoon/network" bridgenet "github.com/cocoonstack/cocoon/network/bridge" "github.com/cocoonstack/cocoon/types" ) +// attachedDevices is the inspect-only view of runtime hot-plugged devices. +// Cocoon never persists this structure; it is read from CH vm.info on demand. +type attachedDevices struct { + Fs []fs.Attached `json:"fs,omitempty"` + Devices []vfio.Attached `json:"devices,omitempty"` +} + +// inspectOutput wraps types.VM with an extra runtime field. Defined in the +// CLI layer to keep types.VM free of inspect-only fields. +type inspectOutput struct { + *types.VM + AttachedDevices *attachedDevices `json:"attached_devices,omitempty"` +} + func (h Handler) Start(cmd *cobra.Command, args []string) error { ctx, conf, err := h.Init(cmd) if err != nil { @@ -88,7 +104,38 @@ func (h Handler) Inspect(cmd *cobra.Command, args []string) error { return fmt.Errorf("inspect: %w", err) } info.State = types.VMState(cmdcore.ReconcileState(info)) - return cmdcore.OutputJSON(info) + + out := inspectOutput{VM: info} + if info.State == types.VMStateRunning { + out.AttachedDevices = collectAttachedDevices(ctx, hyper, args[0]) + } + return cmdcore.OutputJSON(out) +} + +// collectAttachedDevices reads runtime fs/vfio device lists from the +// backend. Errors are logged and dropped — inspect should not fail just +// because vm.info is briefly unreachable. +func collectAttachedDevices(ctx context.Context, hyper hypervisor.Hypervisor, ref string) *attachedDevices { + logger := log.WithFunc("cmd.vm.inspect") + out := &attachedDevices{} + if l, ok := hyper.(fs.Lister); ok { + if devs, err := l.FsList(ctx, ref); err != nil { + logger.Warnf(ctx, "list fs devices for %s: %v", ref, err) + } else { + out.Fs = devs + } + } + if l, ok := hyper.(vfio.Lister); ok { + if devs, err := l.DeviceList(ctx, ref); err != nil { + logger.Warnf(ctx, "list vfio devices for %s: %v", ref, err) + } else { + out.Devices = devs + } + } + if len(out.Fs) == 0 && len(out.Devices) == 0 { + return nil + } + return out } func (h Handler) Console(cmd *cobra.Command, args []string) error { From 401c2bef149731951c7f8d1e1a8f127b2985483c Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 01:41:46 +0800 Subject: [PATCH 07/22] docs: document runtime device attach and constraints README: new 'Runtime Device Attach' section, --shared-memory flag, vm fs / vm device subcommands in the CLI tree. KNOWN_ISSUES: shared-memory creation prereq, CH rejection of snapshot when fs/vfio attached, runtime-only lifecycle (lost on stop/clone/restore). 
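The KNOWN_ISSUES snapshot workaround, spelled out (illustrative names):

  cocoon vm fs detach myvm --tag data
  cocoon vm device detach myvm --id mygpu
  cocoon snapshot save myvm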
--- KNOWN_ISSUES.md | 12 ++++++++++ README.md | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/KNOWN_ISSUES.md b/KNOWN_ISSUES.md index da6ea76..b678647 100644 --- a/KNOWN_ISSUES.md +++ b/KNOWN_ISSUES.md @@ -213,3 +213,15 @@ On CNI plugins with strict per-veth MAC enforcement (Cilium eBPF, Calico eBPF), **Upstream status**: FC's `NetworkOverride` struct only has `iface_id` and `host_dev_name` — no `guest_mac` field. Adding it would follow the existing `VsockOverride` pattern. No issue or PR exists yet. **Workaround**: If MAC matching is required, run `ip link set dev ethX address ` inside the guest after clone (the post-clone hints print the expected MAC values). + +## Vhost-user-fs requires VM-level shared memory + +`cocoon vm fs attach` only works on CH VMs that were created with `--shared-memory`. CH's `memory shared=on` is fixed at VM creation: backend processes (e.g. virtiofsd) need to mmap guest memory via the negotiated memfd, and the memory model cannot be flipped on a running VM. If `--shared-memory` was omitted at create time, the only path is to recreate the VM. Cocoon's preflight reads `vm.info` and surfaces a clear error rather than letting CH return a vague rejection. + +## Snapshotting a VM with attached vhost-user-fs / VFIO is rejected by CH + +Cloud Hypervisor refuses to snapshot a VM that holds a vhost-user-fs share or a VFIO PCI passthrough device. Cocoon does not block the call client-side (the rejection comes from CH itself); the surfaced error explains the cause. `cocoon vm fs detach` / `cocoon vm device detach` first to clear runtime devices, then snapshot. + +## Runtime attached devices do not survive VM stop / clone / restore + +Attaches via `cocoon vm fs attach` and `cocoon vm device attach` are runtime-only — they live in the CH process state and are never written into the VM record, sidecar, or snapshot. After `vm stop`, `vm clone`, or `vm restore`, the user must re-run the attach commands. `vm inspect` reflects the live CH `vm.info` for running VMs and omits `attached_devices` for stopped VMs. This is by design: cocoon does not own the backend (virtiofsd / vfio-pci binding) and cannot guarantee the resource still exists across host events. diff --git a/README.md b/README.md index 2d495be..bb12561 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,12 @@ cocoon │ ├── rm [flags] VM [VM...] Delete VM(s) (--force to stop first) │ ├── restore [flags] VM SNAP Restore a running VM to a snapshot │ ├── status [VM...] Watch VM status in real time +│ ├── fs +│ │ ├── attach [flags] VM Attach a vhost-user-fs share (CH only) +│ │ └── detach [flags] VM Detach a vhost-user-fs share by --tag +│ ├── device +│ │ ├── attach [flags] VM Attach a VFIO PCI device (CH only) +│ │ └── detach [flags] VM Detach a VFIO PCI device by --id │ └── debug [flags] IMAGE Generate hypervisor launch command (dry run) ├── snapshot │ ├── save [flags] VM Create a snapshot from a running VM @@ -187,6 +193,7 @@ Applies to `cocoon vm create`, `cocoon vm run`, and `cocoon vm debug`: | `--no-direct-io` | `false` | Disable O_DIRECT on writable disks (use page cache; CH only, useful for dev/test with few VMs) | | `--data-disk` | empty (repeatable) | Attach an extra data disk: `size=20G[,name=...][,fstype=ext4|none][,mount=/mnt/x][,directio=on|off|auto]`. 
See [Data Disks](#data-disks) | | `--windows` | `false` | Windows guest (UEFI boot, kvm_hyperv=on, no cidata) | +| `--shared-memory` | `false` | Enable CH `memory shared=on`; required for later `vm fs attach` (CH only, fixed for VM lifetime) | ### Clone Flags @@ -374,6 +381,62 @@ Phase 1 inherits data disks 1:1: snapshot reflinks each `data-.raw` into t Restore preflight verifies sidecar integrity, file presence (vmstate, memory, COW, every `data-*.raw`), and per-index Role/Path/RO match between sidecar and CH config.json **before** killing the running VM, so a malformed or imported snapshot fails fast and leaves the live VM untouched. +## Runtime Device Attach (Cloud Hypervisor only) + +Cocoon can hot-plug two classes of external resources onto a running VM: + +- **Vhost-user-fs** — a file share served by an external `virtiofsd` (or compatible backend) over a Unix socket. Attach surfaces a virtio-fs device in the guest, accessible via `mount -t virtiofs /mnt/...`. +- **VFIO PCI passthrough** — a host PCI device bound to `vfio-pci` (GPU, NIC, NVMe). Attach hands the device to the guest with IOMMU isolation. + +Both attaches are **runtime-only**: the device lives only for the current VM process and is gone after stop/restart. Cocoon does not own the backend lifecycle (the user runs `virtiofsd`, binds the PCI device, etc.). Attached devices are not part of the VM record and are not preserved by snapshot / clone / restore. Cloud Hypervisor itself rejects snapshotting a VM that has vhost-user or VFIO devices attached, so plan accordingly. + +### Vhost-user-fs + +Prerequisite: the VM must have been created with `--shared-memory`. CH's `memory shared=on` cannot be flipped on a running VM, and vhost-user-fs requires it to share guest memory with the backend process. + +```bash +# 1) Boot VM with shared-memory enabled. +cocoon vm run --shared-memory --name share-host ghcr.io/cocoonstack/cocoon/ubuntu:24.04 + +# 2) On host, run virtiofsd against a directory. +virtiofsd --socket-path=/tmp/virtiofsd.sock --shared-dir=/srv/data --cache=never & + +# 3) Attach to the VM (tag is the guest mount tag and detach key). +cocoon vm fs attach share-host --socket /tmp/virtiofsd.sock --tag data + +# 4) Inside the guest: +mkdir -p /mnt/data && mount -t virtiofs data /mnt/data + +# 5) Detach later: +cocoon vm fs detach share-host --tag data +``` + +Flags: + +| Flag | Default | Description | +|------|---------|-------------| +| `--socket` | required | Absolute path to the virtiofsd unix socket | +| `--tag` | required | Guest mount tag (also detach key) | +| `--num-queues` | `1` | Request queues | +| `--queue-size` | `1024` | Queue depth | + +### VFIO PCI passthrough + +Prerequisite: host has `intel_iommu=on` (or `amd_iommu=on`) on the kernel command line and the target PCI device is bound to `vfio-pci`. + +```bash +# Bind the device on the host (one-time per device). +echo 0000:01:00.0 > /sys/bus/pci/drivers/vfio-pci/bind # see https://wiki.archlinux.org/title/PCI_passthrough_via_OVMF + +# Attach (BDF or full sysfs path both accepted). +cocoon vm device attach my-vm --pci 01:00.0 --id mygpu + +# Detach. +cocoon vm device detach my-vm --id mygpu +``` + +`cocoon vm inspect VM` includes an `attached_devices` field for running VMs that surfaces every attached vhost-user-fs share and VFIO device, read live from CH `vm.info`. The field is omitted for stopped VMs. 
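+
+Note: depending on the current driver binding, the one-line bind above may fail; a commonly used sysfs sequence is to unbind the device, set `driver_override`, and reprobe (a sketch using standard kernel interfaces; adjust the BDF):
+
+```bash
+echo 0000:01:00.0 > /sys/bus/pci/devices/0000:01:00.0/driver/unbind
+echo vfio-pci > /sys/bus/pci/devices/0000:01:00.0/driver_override
+echo 0000:01:00.0 > /sys/bus/pci/drivers_probe
+```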
+ ## Windows Support Cocoon supports Windows guests via the `--windows` flag: From 112a7b08ca53e9c51c550ae3f93562e495f512a3 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 01:51:02 +0800 Subject: [PATCH 08/22] extend: senior-review cleanup pass - runningVMClient: switch to utils.VerifyProcess for PID-reuse safety (cocoon convention; bare IsProcessAlive can talk to a reused PID) - vfio.Spec: collapse Validate+NormalizePath into NormalizedPath, removing the redundant regex match in DeviceAttach - bdfFromSysfsPath: return empty when CH reports a non-PCI host path rather than echoing the raw path back as a BDF - chDevice: drop unused IOMMU field; v1 does not expose --iommu - FsAttach: clearer shared-memory error referring to creation flag - queryConsolePTY: use shared getVMInfo helper - commands.go: extract attachGroup to fold buildFsCommand and buildDeviceCommand cobra builders into a single shape --- cmd/vm/commands.go | 118 +++++++++++++++------------ extend/fs/fs.go | 4 - extend/vfio/vfio.go | 25 +++--- extend/vfio/vfio_test.go | 12 ++- hypervisor/cloudhypervisor/api.go | 5 +- hypervisor/cloudhypervisor/extend.go | 20 +++-- hypervisor/cloudhypervisor/helper.go | 14 ++-- 7 files changed, 104 insertions(+), 94 deletions(-) diff --git a/cmd/vm/commands.go b/cmd/vm/commands.go index fa2167b..52da71b 100644 --- a/cmd/vm/commands.go +++ b/cmd/vm/commands.go @@ -162,69 +162,85 @@ func Command(h Actions) *cobra.Command { return vmCmd } -func buildFsCommand(h Actions) *cobra.Command { - fsCmd := &cobra.Command{ - Use: "fs", - Short: "Attach/detach a vhost-user-fs share to a running VM (CH only)", - } +// attachGroup describes a parent command that bundles attach/detach +// subcommands sharing the same VM-arg shape and JSON-output flag. +type attachGroup struct { + parent string + parentDesc string + attachDesc string + detachDesc string + attachRun func(*cobra.Command, []string) error + detachRun func(*cobra.Command, []string) error + attachFlag func(*cobra.Command) + detachFlag func(*cobra.Command) +} - attachCmd := &cobra.Command{ +func (g attachGroup) build() *cobra.Command { + parent := &cobra.Command{Use: g.parent, Short: g.parentDesc} + + attach := &cobra.Command{ Use: "attach VM", - Short: "Attach a vhost-user-fs device to a running VM", + Short: g.attachDesc, Args: cobra.ExactArgs(1), - RunE: h.FsAttach, + RunE: g.attachRun, } - attachCmd.Flags().String("socket", "", "absolute path to a virtiofsd unix socket (required)") - attachCmd.Flags().String("tag", "", "guest mount tag (required; also detach key)") - attachCmd.Flags().Int("num-queues", 0, "request queues (0 = default 1)") - attachCmd.Flags().Int("queue-size", 0, "queue depth (0 = default 1024)") //nolint:mnd - _ = attachCmd.MarkFlagRequired("socket") - _ = attachCmd.MarkFlagRequired("tag") - cmdcore.AddOutputFlag(attachCmd) + g.attachFlag(attach) + cmdcore.AddOutputFlag(attach) - detachCmd := &cobra.Command{ + detach := &cobra.Command{ Use: "detach VM", - Short: "Detach a vhost-user-fs device from a running VM", + Short: g.detachDesc, Args: cobra.ExactArgs(1), - RunE: h.FsDetach, + RunE: g.detachRun, } - detachCmd.Flags().String("tag", "", "guest mount tag (required)") - _ = detachCmd.MarkFlagRequired("tag") - cmdcore.AddOutputFlag(detachCmd) + g.detachFlag(detach) + cmdcore.AddOutputFlag(detach) - fsCmd.AddCommand(attachCmd, detachCmd) - return fsCmd + parent.AddCommand(attach, detach) + return parent } -func buildDeviceCommand(h Actions) *cobra.Command { - devCmd := &cobra.Command{ - Use: "device", - Short: "Attach/detach 
a VFIO PCI passthrough device to a running VM (CH only)", - } - - attachCmd := &cobra.Command{ - Use: "attach VM", - Short: "Attach a VFIO PCI device to a running VM", - Args: cobra.ExactArgs(1), - RunE: h.DeviceAttach, - } - attachCmd.Flags().String("pci", "", "BDF (01:00.0 / 0000:01:00.0) or sysfs path /sys/bus/pci/devices/") - attachCmd.Flags().String("id", "", "optional device id; CH auto-generates if empty (must not start with cocoon-)") - _ = attachCmd.MarkFlagRequired("pci") - cmdcore.AddOutputFlag(attachCmd) - - detachCmd := &cobra.Command{ - Use: "detach VM", - Short: "Detach a VFIO PCI device from a running VM", - Args: cobra.ExactArgs(1), - RunE: h.DeviceDetach, - } - detachCmd.Flags().String("id", "", "device id returned by attach (required)") - _ = detachCmd.MarkFlagRequired("id") - cmdcore.AddOutputFlag(detachCmd) +func buildFsCommand(h Actions) *cobra.Command { + return attachGroup{ + parent: "fs", + parentDesc: "Attach/detach a vhost-user-fs share to a running VM (CH only)", + attachDesc: "Attach a vhost-user-fs device to a running VM", + detachDesc: "Detach a vhost-user-fs device from a running VM", + attachRun: h.FsAttach, + detachRun: h.FsDetach, + attachFlag: func(c *cobra.Command) { + c.Flags().String("socket", "", "absolute path to a virtiofsd unix socket (required)") + c.Flags().String("tag", "", "guest mount tag (required; also detach key)") + c.Flags().Int("num-queues", 0, "request queues (0 = default 1)") + c.Flags().Int("queue-size", 0, "queue depth (0 = default 1024)") //nolint:mnd + _ = c.MarkFlagRequired("socket") + _ = c.MarkFlagRequired("tag") + }, + detachFlag: func(c *cobra.Command) { + c.Flags().String("tag", "", "guest mount tag (required)") + _ = c.MarkFlagRequired("tag") + }, + }.build() +} - devCmd.AddCommand(attachCmd, detachCmd) - return devCmd +func buildDeviceCommand(h Actions) *cobra.Command { + return attachGroup{ + parent: "device", + parentDesc: "Attach/detach a VFIO PCI passthrough device to a running VM (CH only)", + attachDesc: "Attach a VFIO PCI device to a running VM", + detachDesc: "Detach a VFIO PCI device from a running VM", + attachRun: h.DeviceAttach, + detachRun: h.DeviceDetach, + attachFlag: func(c *cobra.Command) { + c.Flags().String("pci", "", "BDF (01:00.0 / 0000:01:00.0) or sysfs path /sys/bus/pci/devices/") + c.Flags().String("id", "", "optional device id; CH auto-generates if empty (must not start with cocoon-)") + _ = c.MarkFlagRequired("pci") + }, + detachFlag: func(c *cobra.Command) { + c.Flags().String("id", "", "device id returned by attach (required)") + _ = c.MarkFlagRequired("id") + }, + }.build() } func addVMFlags(cmd *cobra.Command) { diff --git a/extend/fs/fs.go b/extend/fs/fs.go index 07b9a3a..869ecc5 100644 --- a/extend/fs/fs.go +++ b/extend/fs/fs.go @@ -17,10 +17,6 @@ import ( const ( DefaultNumQueues = 1 DefaultQueueSize = 1024 - - // Tag length cap mirrors the conservative virtio-fs mount tag limit; - // longer tags risk truncation across kernels. - MaxTagLen = 36 ) var ( diff --git a/extend/vfio/vfio.go b/extend/vfio/vfio.go index d4454d6..1ad8fad 100644 --- a/extend/vfio/vfio.go +++ b/extend/vfio/vfio.go @@ -16,10 +16,8 @@ import ( "strings" ) -const ( - // SysfsPCIPrefix is the canonical host path for a PCI device. - SysfsPCIPrefix = "/sys/bus/pci/devices/" -) +// SysfsPCIPrefix is the canonical host path for a PCI device. 
+const SysfsPCIPrefix = "/sys/bus/pci/devices/" var ( // Match BDF in either short (01:00.0) or full (0000:01:00.0) form so the @@ -60,20 +58,19 @@ type Lister interface { DeviceList(ctx context.Context, vmRef string) ([]Attached, error) } -// Validate checks the user input shape only. Path existence is asserted -// later by the backend right before calling CH (the host file may be -// removed between CLI parse and the API call). -func (s *Spec) Validate() error { +// NormalizedPath validates the spec and returns the canonical sysfs path +// (NormalizePath already covers PCI shape validation, so callers do not +// need a separate Validate()). Path existence is asserted by the backend +// right before calling CH; the host file may be removed between CLI parse +// and the API call. +func (s *Spec) NormalizedPath() (string, error) { if s.PCI == "" { - return fmt.Errorf("--pci is required") + return "", fmt.Errorf("--pci is required") } if s.ID != "" && (strings.HasPrefix(s.ID, "cocoon-") || !validIDRe.MatchString(s.ID)) { - return fmt.Errorf("--id %q invalid: must match [A-Za-z0-9][A-Za-z0-9_.-]{0,63} and not start with cocoon-", s.ID) - } - if _, err := NormalizePath(s.PCI); err != nil { - return err + return "", fmt.Errorf("--id %q invalid: must match [A-Za-z0-9][A-Za-z0-9_.-]{0,63} and not start with cocoon-", s.ID) } - return nil + return NormalizePath(s.PCI) } // NormalizePath maps any of {01:00.0, 0000:01:00.0, /sys/bus/pci/devices/} diff --git a/extend/vfio/vfio_test.go b/extend/vfio/vfio_test.go index 1e06a7b..edcc5e8 100644 --- a/extend/vfio/vfio_test.go +++ b/extend/vfio/vfio_test.go @@ -44,14 +44,15 @@ func TestNormalizePath(t *testing.T) { } } -func TestSpecValidate(t *testing.T) { +func TestSpecNormalizedPath(t *testing.T) { tests := []struct { name string spec Spec + want string wantErr string }{ - {name: "valid short BDF", spec: Spec{PCI: "01:00.0"}}, - {name: "valid full BDF with id", spec: Spec{PCI: "0000:01:00.0", ID: "mygpu"}}, + {name: "valid short BDF", spec: Spec{PCI: "01:00.0"}, want: SysfsPCIPrefix + "0000:01:00.0"}, + {name: "valid full BDF with id", spec: Spec{PCI: "0000:01:00.0", ID: "mygpu"}, want: SysfsPCIPrefix + "0000:01:00.0"}, {name: "missing pci", spec: Spec{}, wantErr: "--pci is required"}, {name: "id with cocoon prefix", spec: Spec{PCI: "01:00.0", ID: "cocoon-x"}, wantErr: "--id"}, {name: "id with bad chars", spec: Spec{PCI: "01:00.0", ID: "bad id"}, wantErr: "--id"}, @@ -59,7 +60,7 @@ func TestSpecValidate(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - err := tt.spec.Validate() + got, err := tt.spec.NormalizedPath() if tt.wantErr != "" { if err == nil { t.Fatalf("want error containing %q, got nil", tt.wantErr) @@ -72,6 +73,9 @@ func TestSpecValidate(t *testing.T) { if err != nil { t.Fatalf("unexpected error: %v", err) } + if got != tt.want { + t.Errorf("got %q, want %q", got, tt.want) + } }) } } diff --git a/hypervisor/cloudhypervisor/api.go b/hypervisor/cloudhypervisor/api.go index 953e1ce..df8c2c4 100644 --- a/hypervisor/cloudhypervisor/api.go +++ b/hypervisor/cloudhypervisor/api.go @@ -92,9 +92,8 @@ type chFs struct { } type chDevice struct { - ID string `json:"id,omitempty"` - Path string `json:"path"` - IOMMU bool `json:"iommu,omitempty"` + ID string `json:"id,omitempty"` + Path string `json:"path"` } // chPciDeviceInfo is the response body from vm.add-fs / vm.add-device / diff --git a/hypervisor/cloudhypervisor/extend.go b/hypervisor/cloudhypervisor/extend.go index d6f5e15..8cdbee3 100644 --- 
a/hypervisor/cloudhypervisor/extend.go +++ b/hypervisor/cloudhypervisor/extend.go @@ -36,7 +36,7 @@ func (ch *CloudHypervisor) FsAttach(ctx context.Context, vmRef string, spec fs.S return "", err } if !info.Config.Memory.Shared { - return "", fmt.Errorf("fs attach requires VM memory shared=on; recreate the VM with --shared-memory") + return "", fmt.Errorf("fs attach requires the VM to be created with --shared-memory (current memory shared=off; cannot be flipped on a running VM)") } id := fs.DeriveID(spec.Tag) for _, existing := range info.Config.Fs { @@ -114,10 +114,7 @@ func (ch *CloudHypervisor) FsList(ctx context.Context, vmRef string) ([]fs.Attac // DeviceAttach hot-plugs a VFIO PCI passthrough device onto a running CH VM. func (ch *CloudHypervisor) DeviceAttach(ctx context.Context, vmRef string, spec vfio.Spec) (string, error) { - if err := spec.Validate(); err != nil { - return "", err - } - path, err := vfio.NormalizePath(spec.PCI) + path, err := spec.NormalizedPath() if err != nil { return "", err } @@ -201,7 +198,8 @@ func (ch *CloudHypervisor) DeviceList(ctx context.Context, vmRef string) ([]vfio return out, nil } -// runningVMClient resolves vmRef, asserts the VM is in Running state, and +// runningVMClient resolves vmRef, asserts the VM is running with the +// recorded CH process still alive (PID-reuse-safe via VerifyProcess), and // returns an http.Client connected to its CH API socket. func (ch *CloudHypervisor) runningVMClient(ctx context.Context, vmRef string) (*http.Client, error) { vmID, err := ch.ResolveRef(ctx, vmRef) @@ -215,13 +213,17 @@ func (ch *CloudHypervisor) runningVMClient(ctx context.Context, vmRef string) (* if rec.State != types.VMStateRunning { return nil, fmt.Errorf("vm %s is %s: %w", vmID, rec.State, hypervisor.ErrNotRunning) } - if !utils.IsProcessAlive(rec.PID) { - return nil, fmt.Errorf("vm %s pid %d not alive: %w", vmID, rec.PID, hypervisor.ErrNotRunning) + if !utils.VerifyProcess(rec.PID, ch.conf.BinaryName()) { + return nil, fmt.Errorf("vm %s pid %d not %s: %w", vmID, rec.PID, ch.conf.BinaryName(), hypervisor.ErrNotRunning) } return utils.NewSocketHTTPClient(hypervisor.SocketPath(rec.RunDir)), nil } -// bdfFromSysfsPath strips /sys/bus/pci/devices/ prefix when present. +// bdfFromSysfsPath returns the BDF suffix when path is under the canonical +// sysfs PCI prefix; empty otherwise (CH may report a non-PCI host path). func bdfFromSysfsPath(p string) string { + if !strings.HasPrefix(p, vfio.SysfsPCIPrefix) { + return "" + } return strings.TrimPrefix(p, vfio.SysfsPCIPrefix) } diff --git a/hypervisor/cloudhypervisor/helper.go b/hypervisor/cloudhypervisor/helper.go index 3ed2ad8..dd4882f 100644 --- a/hypervisor/cloudhypervisor/helper.go +++ b/hypervisor/cloudhypervisor/helper.go @@ -227,17 +227,13 @@ func powerButton(ctx context.Context, hc *http.Client) error { return vmAPI(ctx, hc, "vm.power-button", nil) } -// queryConsolePTY retrieves the virtio-console PTY path from a running CH instance -// via GET /api/v1/vm.info. Returns empty string if the console is not in Pty mode. +// queryConsolePTY retrieves the virtio-console PTY path from a running CH +// instance via GET /api/v1/vm.info. Returns empty string if the console is +// not in Pty mode. 
func queryConsolePTY(ctx context.Context, apiSocketPath string) (string, error) { - hc := utils.NewSocketHTTPClient(apiSocketPath) - body, err := utils.DoAPI(ctx, hc, http.MethodGet, "http://localhost/api/v1/vm.info", nil, http.StatusOK) + info, err := getVMInfo(ctx, utils.NewSocketHTTPClient(apiSocketPath)) if err != nil { - return "", fmt.Errorf("query vm.info: %w", err) - } - var info chVMInfoResponse - if err := json.Unmarshal(body, &info); err != nil { - return "", fmt.Errorf("decode vm.info: %w", err) + return "", err } if info.Config.Console.File == "" { return "", fmt.Errorf("console PTY not available (mode=%s)", info.Config.Console.Mode) From 0b591aba4607cd02a5759a08b0d9385b6fa93c98 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 02:12:07 +0800 Subject: [PATCH 09/22] cloudhypervisor: fix runningVMClient and DeviceAttach check order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit runningVMClient was reading rec.PID from the VM record, but cocoon never stores the live PID there — it lives in the runDir/ch.pid file. The existing canonical gate (Backend.WithRunningVM) reads ReadPIDFile + VerifyProcessCmdline; mirror that pattern so attach/detach see a real PID instead of always 0 (which caused 'pid 0 not cloud-hypervisor' on healthy VMs). DeviceAttach: reorder os.Stat after runningVMClient so a stopped VM reports the VM-state error instead of a misleading host-path error. --- hypervisor/cloudhypervisor/extend.go | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/hypervisor/cloudhypervisor/extend.go b/hypervisor/cloudhypervisor/extend.go index 8cdbee3..c892e3f 100644 --- a/hypervisor/cloudhypervisor/extend.go +++ b/hypervisor/cloudhypervisor/extend.go @@ -118,15 +118,17 @@ func (ch *CloudHypervisor) DeviceAttach(ctx context.Context, vmRef string, spec if err != nil { return "", err } + hc, err := ch.runningVMClient(ctx, vmRef) + if err != nil { + return "", err + } + // Stat after the running-VM check: a stopped VM should report the VM + // state error first, not a misleading host-path error. if st, statErr := os.Stat(path); statErr != nil { return "", fmt.Errorf("pci path %s: %w", path, statErr) } else if !st.IsDir() { return "", fmt.Errorf("pci path %s: not a directory", path) } - hc, err := ch.runningVMClient(ctx, vmRef) - if err != nil { - return "", err - } info, err := getVMInfo(ctx, hc) if err != nil { return "", err @@ -198,8 +200,8 @@ func (ch *CloudHypervisor) DeviceList(ctx context.Context, vmRef string) ([]vfio return out, nil } -// runningVMClient resolves vmRef, asserts the VM is running with the -// recorded CH process still alive (PID-reuse-safe via VerifyProcess), and +// runningVMClient resolves vmRef, asserts the recorded CH process is still +// alive (PID file + cmdline match — same gate as WithRunningVM), and // returns an http.Client connected to its CH API socket. 
func (ch *CloudHypervisor) runningVMClient(ctx context.Context, vmRef string) (*http.Client, error) { vmID, err := ch.ResolveRef(ctx, vmRef) @@ -213,10 +215,12 @@ func (ch *CloudHypervisor) runningVMClient(ctx context.Context, vmRef string) (* if rec.State != types.VMStateRunning { return nil, fmt.Errorf("vm %s is %s: %w", vmID, rec.State, hypervisor.ErrNotRunning) } - if !utils.VerifyProcess(rec.PID, ch.conf.BinaryName()) { - return nil, fmt.Errorf("vm %s pid %d not %s: %w", vmID, rec.PID, ch.conf.BinaryName(), hypervisor.ErrNotRunning) + sockPath := hypervisor.SocketPath(rec.RunDir) + pid, _ := utils.ReadPIDFile(ch.PIDFilePath(rec.RunDir)) + if !utils.VerifyProcessCmdline(pid, ch.conf.BinaryName(), sockPath) { + return nil, fmt.Errorf("vm %s pid %d not %s: %w", vmID, pid, ch.conf.BinaryName(), hypervisor.ErrNotRunning) } - return utils.NewSocketHTTPClient(hypervisor.SocketPath(rec.RunDir)), nil + return utils.NewSocketHTTPClient(sockPath), nil } // bdfFromSysfsPath returns the BDF suffix when path is under the canonical From 63e35311f4c843654780e8ea17494eb59f76fd1c Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 09:40:24 +0800 Subject: [PATCH 10/22] cloudhypervisor: factor attachWith / detachWith / listWith helpers Fs and VFIO hot-plug skeletons collapse into three shared functions with per-call closures supplying the spec validation, conflict scan, device-id extraction, and CH endpoint. The six backend methods (FsAttach/FsDetach/FsList + DeviceAttach/DeviceDetach/DeviceList) shrink from independent ~30-line bodies to thin specs. addFsVM / addDeviceVM removed: attachWith now marshals + decodes PciDeviceInfo inline so the only call sites that needed those helpers no longer exist. Inline TODO on FsList / DeviceList notes the dual vm.info round-trip on inspect; consolidating into one combined Lister is left for follow-up since it requires a cross-package interface.
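As a sketch of the payoff (illustrative only; the endpoint name and body fields below are assumptions, not verified against CH's API), a new hot-plug surface now costs just a request body plus a per-surface conflict closure:

    // Hypothetical pmem hot-plug riding the shared skeleton: attachWith
    // supplies the live-VM gate, the vm.info fetch, the endpoint call, and
    // the id fallback; only the closure is surface-specific.
    func (ch *CloudHypervisor) PmemAttach(ctx context.Context, vmRef, file, id string) (string, error) {
        return ch.attachWith(ctx, vmRef, "vm.add-pmem",
            map[string]any{"id": id, "file": file}, id,
            func(info *chVMInfoResponse) error {
                return nil // per-surface conflict scan goes here
            })
    }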
--- hypervisor/cloudhypervisor/extend.go | 238 +++++++++++++++------------ hypervisor/cloudhypervisor/helper.go | 24 --- 2 files changed, 130 insertions(+), 132 deletions(-) diff --git a/hypervisor/cloudhypervisor/extend.go b/hypervisor/cloudhypervisor/extend.go index c892e3f..4aeeb6e 100644 --- a/hypervisor/cloudhypervisor/extend.go +++ b/hypervisor/cloudhypervisor/extend.go @@ -2,6 +2,7 @@ package cloudhypervisor import ( "context" + "encoding/json" "errors" "fmt" "net/http" @@ -27,89 +28,57 @@ func (ch *CloudHypervisor) FsAttach(ctx context.Context, vmRef string, spec fs.S if err := spec.Validate(); err != nil { return "", err } - hc, err := ch.runningVMClient(ctx, vmRef) - if err != nil { - return "", err - } - info, err := getVMInfo(ctx, hc) - if err != nil { - return "", err - } - if !info.Config.Memory.Shared { - return "", fmt.Errorf("fs attach requires the VM to be created with --shared-memory (current memory shared=off; cannot be flipped on a running VM)") - } id := fs.DeriveID(spec.Tag) - for _, existing := range info.Config.Fs { - if existing.Tag == spec.Tag { - return "", fmt.Errorf("fs tag %q already attached", spec.Tag) - } - if existing.ID == id { - return "", fmt.Errorf("fs id %q already attached", id) - } - } - resp, err := addFsVM(ctx, hc, chFs{ + return ch.attachWith(ctx, vmRef, "vm.add-fs", chFs{ ID: id, Tag: spec.Tag, Socket: spec.Socket, NumQueues: spec.NumQueues, QueueSize: spec.QueueSize, + }, id, func(info *chVMInfoResponse) error { + if !info.Config.Memory.Shared { + return fmt.Errorf("fs attach requires the VM to be created with --shared-memory (current memory shared=off; cannot be flipped on a running VM)") + } + for _, ex := range info.Config.Fs { + if ex.Tag == spec.Tag { + return fmt.Errorf("fs tag %q already attached", spec.Tag) + } + if ex.ID == id { + return fmt.Errorf("fs id %q already attached", id) + } + } + return nil }) - if err != nil { - return "", fmt.Errorf("vm.add-fs: %w", err) - } - if resp.ID != "" { - return resp.ID, nil - } - return id, nil } // FsDetach removes a previously attached vhost-user-fs device by tag. func (ch *CloudHypervisor) FsDetach(ctx context.Context, vmRef, tag string) error { if tag == "" { - return fmt.Errorf("--tag is required") - } - hc, err := ch.runningVMClient(ctx, vmRef) - if err != nil { - return err + return fmt.Errorf("tag is required") } - info, err := getVMInfo(ctx, hc) - if err != nil { - return err - } - deviceID := "" - for _, existing := range info.Config.Fs { - if existing.Tag == tag { - deviceID = existing.ID - break + return ch.detachWith(ctx, vmRef, func(info *chVMInfoResponse) (string, error) { + for _, ex := range info.Config.Fs { + if ex.Tag == tag { + return ex.ID, nil + } } - } - if deviceID == "" { - return fmt.Errorf("fs tag %q not attached", tag) - } - if err := removeDeviceVM(ctx, hc, deviceID); err != nil { - return fmt.Errorf("vm.remove-device %s: %w", deviceID, err) - } - return nil + return "", fmt.Errorf("fs tag %q not attached", tag) + }) } // FsList enumerates currently attached vhost-user-fs devices. +// +// TODO(inspect): cmd/vm Inspect calls FsList and DeviceList back-to-back, +// each fetching its own vm.info. A combined Lister returning both arrays +// from a single vm.info call would halve the round-trips on a running VM. 
func (ch *CloudHypervisor) FsList(ctx context.Context, vmRef string) ([]fs.Attached, error) { - hc, err := ch.runningVMClient(ctx, vmRef) - if err != nil { - if errors.Is(err, hypervisor.ErrNotRunning) { - return nil, nil + return listWith(ctx, ch, vmRef, func(info *chVMInfoResponse) []fs.Attached { + out := make([]fs.Attached, 0, len(info.Config.Fs)) + for _, f := range info.Config.Fs { + out = append(out, fs.Attached{ID: f.ID, Tag: f.Tag, Socket: f.Socket}) } - return nil, err - } - info, err := getVMInfo(ctx, hc) - if err != nil { - return nil, err - } - out := make([]fs.Attached, 0, len(info.Config.Fs)) - for _, f := range info.Config.Fs { - out = append(out, fs.Attached{ID: f.ID, Tag: f.Tag, Socket: f.Socket}) - } - return out, nil + return out + }) } // DeviceAttach hot-plugs a VFIO PCI passthrough device onto a running CH VM. @@ -118,44 +87,102 @@ func (ch *CloudHypervisor) DeviceAttach(ctx context.Context, vmRef string, spec if err != nil { return "", err } + return ch.attachWith(ctx, vmRef, "vm.add-device", chDevice{ + ID: spec.ID, + Path: path, + }, spec.ID, func(info *chVMInfoResponse) error { + // host stat happens after the running-VM gate inside attachWith, + // so a stopped VM reports the VM-state error instead of misleading + // host path output. + if st, statErr := os.Stat(path); statErr != nil { + return fmt.Errorf("pci path %s: %w", path, statErr) + } else if !st.IsDir() { + return fmt.Errorf("pci path %s: not a directory", path) + } + for _, ex := range info.Config.Devices { + if ex.Path == path { + return fmt.Errorf("device %s already attached (id=%s)", path, ex.ID) + } + if spec.ID != "" && ex.ID == spec.ID { + return fmt.Errorf("device id %q already in use", spec.ID) + } + } + return nil + }) +} + +// DeviceDetach removes a previously attached VFIO device by id. +func (ch *CloudHypervisor) DeviceDetach(ctx context.Context, vmRef, id string) error { + if id == "" { + return fmt.Errorf("id is required") + } + return ch.detachWith(ctx, vmRef, func(info *chVMInfoResponse) (string, error) { + for _, ex := range info.Config.Devices { + if ex.ID == id { + return id, nil + } + } + return "", fmt.Errorf("device id %q not attached", id) + }) +} + +// DeviceList enumerates currently attached VFIO PCI passthrough devices. +// +// TODO(inspect): see FsList note — combined Lister would dedupe vm.info. +func (ch *CloudHypervisor) DeviceList(ctx context.Context, vmRef string) ([]vfio.Attached, error) { + return listWith(ctx, ch, vmRef, func(info *chVMInfoResponse) []vfio.Attached { + out := make([]vfio.Attached, 0, len(info.Config.Devices)) + for _, d := range info.Config.Devices { + out = append(out, vfio.Attached{ID: d.ID, BDF: bdfFromSysfsPath(d.Path)}) + } + return out + }) +} + +// attachWith is the shared skeleton for hot-add operations: gate on a +// live VM, fetch vm.info, run preCheck (memory/conflict/host-path), call +// the CH endpoint, return the device id (CH-assigned or fallback). +func (ch *CloudHypervisor) attachWith( + ctx context.Context, vmRef, endpoint string, + body any, fallbackID string, + preCheck func(*chVMInfoResponse) error, +) (string, error) { hc, err := ch.runningVMClient(ctx, vmRef) if err != nil { return "", err } - // Stat after the running-VM check: a stopped VM should report the VM - // state error first, not a misleading host-path error. 
- if st, statErr := os.Stat(path); statErr != nil { - return "", fmt.Errorf("pci path %s: %w", path, statErr) - } else if !st.IsDir() { - return "", fmt.Errorf("pci path %s: not a directory", path) - } info, err := getVMInfo(ctx, hc) if err != nil { return "", err } - for _, existing := range info.Config.Devices { - if existing.Path == path { - return "", fmt.Errorf("device %s already attached (id=%s)", path, existing.ID) - } - if spec.ID != "" && existing.ID == spec.ID { - return "", fmt.Errorf("device id %q already in use", spec.ID) - } + if checkErr := preCheck(info); checkErr != nil { + return "", checkErr } - resp, err := addDeviceVM(ctx, hc, chDevice{ID: spec.ID, Path: path}) + bodyBytes, err := json.Marshal(body) if err != nil { - return "", fmt.Errorf("vm.add-device: %w", err) + return "", fmt.Errorf("marshal %s: %w", endpoint, err) } - if resp.ID != "" { - return resp.ID, nil + resp, err := vmAPICall(ctx, hc, endpoint, bodyBytes, http.StatusOK, http.StatusNoContent) + if err != nil { + return "", fmt.Errorf("%s: %w", endpoint, err) } - return spec.ID, nil + pci, err := decodePciDeviceInfo(resp) + if err != nil { + return "", err + } + if pci.ID != "" { + return pci.ID, nil + } + return fallbackID, nil } -// DeviceDetach removes a previously attached VFIO device by id. -func (ch *CloudHypervisor) DeviceDetach(ctx context.Context, vmRef, id string) error { - if id == "" { - return fmt.Errorf("--id is required") - } +// detachWith is the shared skeleton for hot-remove: gate on a live VM, +// fetch vm.info, let the caller find the target device id, then call +// vm.remove-device. +func (ch *CloudHypervisor) detachWith( + ctx context.Context, vmRef string, + findID func(*chVMInfoResponse) (string, error), +) error { hc, err := ch.runningVMClient(ctx, vmRef) if err != nil { return err @@ -164,24 +191,23 @@ func (ch *CloudHypervisor) DeviceDetach(ctx context.Context, vmRef, id string) e if err != nil { return err } - found := false - for _, existing := range info.Config.Devices { - if existing.ID == id { - found = true - break - } - } - if !found { - return fmt.Errorf("device id %q not attached", id) + deviceID, err := findID(info) + if err != nil { + return err } - if err := removeDeviceVM(ctx, hc, id); err != nil { - return fmt.Errorf("vm.remove-device %s: %w", id, err) + if err := removeDeviceVM(ctx, hc, deviceID); err != nil { + return fmt.Errorf("vm.remove-device %s: %w", deviceID, err) } return nil } -// DeviceList enumerates currently attached VFIO PCI passthrough devices. -func (ch *CloudHypervisor) DeviceList(ctx context.Context, vmRef string) ([]vfio.Attached, error) { +// listWith is the shared skeleton for inspect-time enumeration. +// Stopped VMs return a nil slice (not an error) so inspect can omit the +// field cleanly. 
+func listWith[A any]( + ctx context.Context, ch *CloudHypervisor, vmRef string, + extract func(*chVMInfoResponse) []A, +) ([]A, error) { hc, err := ch.runningVMClient(ctx, vmRef) if err != nil { if errors.Is(err, hypervisor.ErrNotRunning) { @@ -193,16 +219,12 @@ func (ch *CloudHypervisor) DeviceList(ctx context.Context, vmRef string) ([]vfio if err != nil { return nil, err } - out := make([]vfio.Attached, 0, len(info.Config.Devices)) - for _, d := range info.Config.Devices { - out = append(out, vfio.Attached{ID: d.ID, BDF: bdfFromSysfsPath(d.Path)}) - } - return out, nil + return extract(info), nil } // runningVMClient resolves vmRef, asserts the recorded CH process is still -// alive (PID file + cmdline match — same gate as WithRunningVM), and -// returns an http.Client connected to its CH API socket. +// alive (PID file + cmdline match — same gate as Backend.WithRunningVM), +// and returns an http.Client connected to its CH API socket. func (ch *CloudHypervisor) runningVMClient(ctx context.Context, vmRef string) (*http.Client, error) { vmID, err := ch.ResolveRef(ctx, vmRef) if err != nil { diff --git a/hypervisor/cloudhypervisor/helper.go b/hypervisor/cloudhypervisor/helper.go index dd4882f..dd06cd8 100644 --- a/hypervisor/cloudhypervisor/helper.go +++ b/hypervisor/cloudhypervisor/helper.go @@ -174,30 +174,6 @@ func addNetVM(ctx context.Context, hc *http.Client, net chNet) error { return vmAPI(ctx, hc, "vm.add-net", body, http.StatusOK, http.StatusNoContent) } -func addFsVM(ctx context.Context, hc *http.Client, fs chFs) (chPciDeviceInfo, error) { - body, err := json.Marshal(fs) - if err != nil { - return chPciDeviceInfo{}, fmt.Errorf("marshal add-fs request: %w", err) - } - resp, err := vmAPICall(ctx, hc, "vm.add-fs", body, http.StatusOK, http.StatusNoContent) - if err != nil { - return chPciDeviceInfo{}, err - } - return decodePciDeviceInfo(resp) -} - -func addDeviceVM(ctx context.Context, hc *http.Client, dev chDevice) (chPciDeviceInfo, error) { - body, err := json.Marshal(dev) - if err != nil { - return chPciDeviceInfo{}, fmt.Errorf("marshal add-device request: %w", err) - } - resp, err := vmAPICall(ctx, hc, "vm.add-device", body, http.StatusOK, http.StatusNoContent) - if err != nil { - return chPciDeviceInfo{}, err - } - return decodePciDeviceInfo(resp) -} - // getVMInfo fetches vm.info; cocoon uses it to detect tag/id conflicts // before hot-add and to surface attached devices through inspect. func getVMInfo(ctx context.Context, hc *http.Client) (*chVMInfoResponse, error) { From 8b0c7260ac07b7aa02193743a84762b40347f907 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 09:51:14 +0800 Subject: [PATCH 11/22] cloudhypervisor: extract inspectRunning helper for the three with-helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit attachWith / detachWith / listWith all opened with the same pair — runningVMClient + getVMInfo — so factor it into one inspectRunning call. The change is mechanical: each helper goes from two-step setup to one, and the live-VM gate stays on a single code path. 
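In sketch form, the opener duplicated across the three helpers (error-returning flavor shown; attachWith and listWith return their zero values the same way):

    // before: the same two-step setup in attachWith, detachWith, listWith
    hc, err := ch.runningVMClient(ctx, vmRef)
    if err != nil {
        return err
    }
    info, err := getVMInfo(ctx, hc)
    if err != nil {
        return err
    }

    // after: one shared bootstrap, one live-VM gate code path
    hc, info, err := ch.inspectRunning(ctx, vmRef)
    if err != nil {
        return err
    }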
--- hypervisor/cloudhypervisor/extend.go | 41 ++++++++++++++-------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/hypervisor/cloudhypervisor/extend.go b/hypervisor/cloudhypervisor/extend.go index 4aeeb6e..f1f940a 100644 --- a/hypervisor/cloudhypervisor/extend.go +++ b/hypervisor/cloudhypervisor/extend.go @@ -139,19 +139,28 @@ func (ch *CloudHypervisor) DeviceList(ctx context.Context, vmRef string) ([]vfio }) } -// attachWith is the shared skeleton for hot-add operations: gate on a -// live VM, fetch vm.info, run preCheck (memory/conflict/host-path), call -// the CH endpoint, return the device id (CH-assigned or fallback). +// inspectRunning is the shared bootstrap for every extend op: gate on a +// live VM and grab a fresh vm.info snapshot to feed conflict scans, +// memory checks, or device-id lookups. +func (ch *CloudHypervisor) inspectRunning(ctx context.Context, vmRef string) (*http.Client, *chVMInfoResponse, error) { + hc, err := ch.runningVMClient(ctx, vmRef) + if err != nil { + return nil, nil, err + } + info, err := getVMInfo(ctx, hc) + if err != nil { + return nil, nil, err + } + return hc, info, nil +} + +// attachWith is the shared skeleton for hot-add operations. func (ch *CloudHypervisor) attachWith( ctx context.Context, vmRef, endpoint string, body any, fallbackID string, preCheck func(*chVMInfoResponse) error, ) (string, error) { - hc, err := ch.runningVMClient(ctx, vmRef) - if err != nil { - return "", err - } - info, err := getVMInfo(ctx, hc) + hc, info, err := ch.inspectRunning(ctx, vmRef) if err != nil { return "", err } @@ -176,18 +185,12 @@ func (ch *CloudHypervisor) attachWith( return fallbackID, nil } -// detachWith is the shared skeleton for hot-remove: gate on a live VM, -// fetch vm.info, let the caller find the target device id, then call -// vm.remove-device. +// detachWith is the shared skeleton for hot-remove operations. func (ch *CloudHypervisor) detachWith( ctx context.Context, vmRef string, findID func(*chVMInfoResponse) (string, error), ) error { - hc, err := ch.runningVMClient(ctx, vmRef) - if err != nil { - return err - } - info, err := getVMInfo(ctx, hc) + hc, info, err := ch.inspectRunning(ctx, vmRef) if err != nil { return err } @@ -208,17 +211,13 @@ func listWith[A any]( ctx context.Context, ch *CloudHypervisor, vmRef string, extract func(*chVMInfoResponse) []A, ) ([]A, error) { - hc, err := ch.runningVMClient(ctx, vmRef) + _, info, err := ch.inspectRunning(ctx, vmRef) if err != nil { if errors.Is(err, hypervisor.ErrNotRunning) { return nil, nil } return nil, err } - info, err := getVMInfo(ctx, hc) - if err != nil { - return nil, err - } return extract(info), nil } From 2a0ce2c735b7ffcf8228feb5ac4aec3a77cd6432 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 09:58:07 +0800 Subject: [PATCH 12/22] extend, vm/debug: address two P2 review findings vfio.NormalizePath now rejects absolute paths that fall outside /sys/bus/pci/devices/ and absolute paths whose suffix is not a valid full BDF. Previously cocoon would happily forward /dev/null or any non-PCI directory to vm.add-device. Tests flip the /dev/null case from PASS to expected rejection and add coverage for sysfs-prefixed garbage and short-form BDFs under the prefix. cmd/vm/debug now threads VMConfig.SharedMemory into printCommonCHArgs so 'cocoon vm debug --shared-memory' actually prints --memory ..., shared=on. The check that rejects --fc + --shared-memory in create/run is mirrored on the debug path so the contract is the same across the three entry points. 
Backend-layer error messages also drop the '--flag' prefix in fs and vfio Spec validation; the CLI flag terminology belongs in the cmd/vm layer, not the extend packages. --- cmd/vm/debug.go | 13 ++++++++++--- extend/fs/fs.go | 12 ++++++------ extend/fs/fs_test.go | 10 +++++----- extend/vfio/vfio.go | 19 +++++++++++++++---- extend/vfio/vfio_test.go | 11 +++++++---- 5 files changed, 43 insertions(+), 22 deletions(-) diff --git a/cmd/vm/debug.go b/cmd/vm/debug.go index 6e1d3be..1f601a5 100644 --- a/cmd/vm/debug.go +++ b/cmd/vm/debug.go @@ -35,6 +35,9 @@ func (h Handler) Debug(cmd *cobra.Command, args []string) error { if err != nil { return err } + if conf.UseFirecracker && vmCfg.SharedMemory { + return fmt.Errorf("--fc and --shared-memory are mutually exclusive: Firecracker does not support vhost-user-fs hot-plug") + } if len(vmCfg.DataDisks) > 0 { fmt.Fprintln(os.Stderr, "warning: --data-disk is ignored in debug mode (debug only prints the hypervisor launch command; data disks need PrepareDataDisks to materialize)") } @@ -191,16 +194,20 @@ func printCHDebug(configs []*types.StorageConfig, boot *types.BootConfig, vmCfg diskArgs := cloudhypervisor.DebugDiskCLIArgs([]*types.StorageConfig{{Path: cowPath, RO: false}}, cpu, diskQueueSize, noDirectIO) fmt.Printf(" \"%s\" \\\n", diskArgs[0]) } - printCommonCHArgs(cpu, maxCPU, memory, balloon, vmCfg.Windows) + printCommonCHArgs(cpu, maxCPU, memory, balloon, vmCfg.Windows, vmCfg.SharedMemory) } -func printCommonCHArgs(cpu, maxCPU, memory, balloon int, windows bool) { +func printCommonCHArgs(cpu, maxCPU, memory, balloon int, windows, sharedMemory bool) { cpuExtra := "" if windows { cpuExtra = ",kvm_hyperv=on" } + memExtra := "" + if sharedMemory { + memExtra = ",shared=on" + } fmt.Printf(" --cpus boot=%d,max=%d%s \\\n", cpu, maxCPU, cpuExtra) - fmt.Printf(" --memory size=%dM \\\n", memory) + fmt.Printf(" --memory size=%dM%s \\\n", memory, memExtra) fmt.Printf(" --rng src=/dev/urandom \\\n") fmt.Printf(" --balloon size=%dM,deflate_on_oom=on,free_page_reporting=on \\\n", balloon) fmt.Printf(" --watchdog \\\n") diff --git a/extend/fs/fs.go b/extend/fs/fs.go index 869ecc5..150e287 100644 --- a/extend/fs/fs.go +++ b/extend/fs/fs.go @@ -59,22 +59,22 @@ type Lister interface { // Validate enforces required fields and applies queue-size defaults. 
func (s *Spec) Validate() error { if s.Socket == "" { - return fmt.Errorf("--socket is required") + return fmt.Errorf("socket is required") } if !filepath.IsAbs(s.Socket) { - return fmt.Errorf("--socket must be absolute, got %q", s.Socket) + return fmt.Errorf("socket must be absolute, got %q", s.Socket) } if s.Tag == "" { - return fmt.Errorf("--tag is required") + return fmt.Errorf("tag is required") } if !validTagRe.MatchString(s.Tag) { - return fmt.Errorf("--tag %q invalid: must match [A-Za-z0-9][A-Za-z0-9_-]{0,35}", s.Tag) + return fmt.Errorf("tag %q invalid: must match [A-Za-z0-9][A-Za-z0-9_-]{0,35}", s.Tag) } if s.NumQueues < 0 { - return fmt.Errorf("--num-queues must be non-negative, got %d", s.NumQueues) + return fmt.Errorf("num-queues must be non-negative, got %d", s.NumQueues) } if s.QueueSize < 0 { - return fmt.Errorf("--queue-size must be non-negative, got %d", s.QueueSize) + return fmt.Errorf("queue-size must be non-negative, got %d", s.QueueSize) } if s.NumQueues == 0 { s.NumQueues = DefaultNumQueues diff --git a/extend/fs/fs_test.go b/extend/fs/fs_test.go index 6207c77..579ab2e 100644 --- a/extend/fs/fs_test.go +++ b/extend/fs/fs_test.go @@ -15,11 +15,11 @@ func TestSpecValidate(t *testing.T) { }{ {name: "minimal valid", spec: Spec{Socket: "/tmp/x.sock", Tag: "share"}, wantNumQ: DefaultNumQueues, wantQSize: DefaultQueueSize}, {name: "explicit queues", spec: Spec{Socket: "/tmp/x.sock", Tag: "share", NumQueues: 4, QueueSize: 256}, wantNumQ: 4, wantQSize: 256}, - {name: "missing socket", spec: Spec{Tag: "share"}, wantErr: "--socket is required"}, - {name: "relative socket", spec: Spec{Socket: "rel.sock", Tag: "share"}, wantErr: "--socket must be absolute"}, - {name: "missing tag", spec: Spec{Socket: "/tmp/x.sock"}, wantErr: "--tag is required"}, - {name: "tag with slash", spec: Spec{Socket: "/tmp/x.sock", Tag: "a/b"}, wantErr: "--tag"}, - {name: "tag too long", spec: Spec{Socket: "/tmp/x.sock", Tag: strings.Repeat("a", 37)}, wantErr: "--tag"}, + {name: "missing socket", spec: Spec{Tag: "share"}, wantErr: "socket is required"}, + {name: "relative socket", spec: Spec{Socket: "rel.sock", Tag: "share"}, wantErr: "socket must be absolute"}, + {name: "missing tag", spec: Spec{Socket: "/tmp/x.sock"}, wantErr: "tag is required"}, + {name: "tag with slash", spec: Spec{Socket: "/tmp/x.sock", Tag: "a/b"}, wantErr: "tag"}, + {name: "tag too long", spec: Spec{Socket: "/tmp/x.sock", Tag: strings.Repeat("a", 37)}, wantErr: "tag"}, {name: "negative queues", spec: Spec{Socket: "/tmp/x.sock", Tag: "share", NumQueues: -1}, wantErr: "non-negative"}, } for _, tt := range tests { diff --git a/extend/vfio/vfio.go b/extend/vfio/vfio.go index 1ad8fad..372ac7f 100644 --- a/extend/vfio/vfio.go +++ b/extend/vfio/vfio.go @@ -65,23 +65,34 @@ type Lister interface { // and the API call. func (s *Spec) NormalizedPath() (string, error) { if s.PCI == "" { - return "", fmt.Errorf("--pci is required") + return "", fmt.Errorf("pci is required") } if s.ID != "" && (strings.HasPrefix(s.ID, "cocoon-") || !validIDRe.MatchString(s.ID)) { - return "", fmt.Errorf("--id %q invalid: must match [A-Za-z0-9][A-Za-z0-9_.-]{0,63} and not start with cocoon-", s.ID) + return "", fmt.Errorf("id %q invalid: must match [A-Za-z0-9][A-Za-z0-9_.-]{0,63} and not start with cocoon-", s.ID) } return NormalizePath(s.PCI) } // NormalizePath maps any of {01:00.0, 0000:01:00.0, /sys/bus/pci/devices/} // into the canonical sysfs path that CH's vm.add-device expects. 
+// Absolute paths must live under /sys/bus/pci/devices/ with a valid BDF +// suffix — accepting an arbitrary host path would let cocoon hand CH a +// non-PCI directory. func NormalizePath(input string) (string, error) { in := strings.TrimSpace(input) if in == "" { return "", fmt.Errorf("empty pci path") } if strings.HasPrefix(in, "/") { - return filepath.Clean(in), nil + cleaned := filepath.Clean(in) + if !strings.HasPrefix(cleaned, SysfsPCIPrefix) { + return "", fmt.Errorf("pci %q invalid: absolute path must be under %s", input, SysfsPCIPrefix) + } + bdf := strings.ToLower(strings.TrimPrefix(cleaned, SysfsPCIPrefix)) + if !bdfFullRe.MatchString(bdf) { + return "", fmt.Errorf("pci %q invalid: sysfs BDF suffix must match 0000:xx:yy.z", input) + } + return SysfsPCIPrefix + bdf, nil } low := strings.ToLower(in) switch { @@ -90,5 +101,5 @@ func NormalizePath(input string) (string, error) { case bdfShortRe.MatchString(low): return SysfsPCIPrefix + "0000:" + low, nil } - return "", fmt.Errorf("--pci %q invalid: expect BDF (01:00.0 / 0000:01:00.0) or sysfs path", input) + return "", fmt.Errorf("pci %q invalid: expect BDF (01:00.0 / 0000:01:00.0) or sysfs path", input) } diff --git a/extend/vfio/vfio_test.go b/extend/vfio/vfio_test.go index edcc5e8..f8dd1c2 100644 --- a/extend/vfio/vfio_test.go +++ b/extend/vfio/vfio_test.go @@ -17,7 +17,10 @@ func TestNormalizePath(t *testing.T) { {name: "uppercase BDF", in: "00:1F.7", want: SysfsPCIPrefix + "0000:00:1f.7"}, {name: "sysfs path", in: SysfsPCIPrefix + "0000:01:00.0", want: SysfsPCIPrefix + "0000:01:00.0"}, {name: "sysfs path with trailing slash", in: SysfsPCIPrefix + "0000:01:00.0/", want: SysfsPCIPrefix + "0000:01:00.0"}, - {name: "absolute non-sysfs path", in: "/dev/null", want: "/dev/null"}, + {name: "sysfs path uppercase BDF", in: SysfsPCIPrefix + "0000:0A:00.0", want: SysfsPCIPrefix + "0000:0a:00.0"}, + {name: "absolute non-sysfs path rejected", in: "/dev/null", wantErr: "must be under"}, + {name: "sysfs prefix with garbage suffix", in: SysfsPCIPrefix + "garbage", wantErr: "BDF suffix"}, + {name: "sysfs prefix with short BDF", in: SysfsPCIPrefix + "01:00.0", wantErr: "BDF suffix"}, {name: "empty", in: "", wantErr: "empty"}, {name: "garbage", in: "not-a-bdf", wantErr: "invalid"}, {name: "bad function digit", in: "01:00.8", wantErr: "invalid"}, @@ -53,9 +56,9 @@ func TestSpecNormalizedPath(t *testing.T) { }{ {name: "valid short BDF", spec: Spec{PCI: "01:00.0"}, want: SysfsPCIPrefix + "0000:01:00.0"}, {name: "valid full BDF with id", spec: Spec{PCI: "0000:01:00.0", ID: "mygpu"}, want: SysfsPCIPrefix + "0000:01:00.0"}, - {name: "missing pci", spec: Spec{}, wantErr: "--pci is required"}, - {name: "id with cocoon prefix", spec: Spec{PCI: "01:00.0", ID: "cocoon-x"}, wantErr: "--id"}, - {name: "id with bad chars", spec: Spec{PCI: "01:00.0", ID: "bad id"}, wantErr: "--id"}, + {name: "missing pci", spec: Spec{}, wantErr: "pci is required"}, + {name: "id with cocoon prefix", spec: Spec{PCI: "01:00.0", ID: "cocoon-x"}, wantErr: "id"}, + {name: "id with bad chars", spec: Spec{PCI: "01:00.0", ID: "bad id"}, wantErr: "id"}, {name: "bad pci", spec: Spec{PCI: "junk"}, wantErr: "invalid"}, } for _, tt := range tests { From 98a5dfed87ef806afaaee51b1669482e14c03dcb Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 10:00:01 +0800 Subject: [PATCH 13/22] docs: tighten --pci spec wording, document virtiofsd single-shot lifecycle README: spell out which inputs vm device attach --pci accepts now that NormalizePath rejects arbitrary absolute paths. 
The previous 'BDF or full sysfs path' phrasing was loose and could read as 'any path'. KNOWN_ISSUES: add a note that upstream virtiofsd exits after one client disconnect, so scripted attach/detach cycles must respawn the daemon between calls. Caught during E2E. --- KNOWN_ISSUES.md | 4 ++++ README.md | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/KNOWN_ISSUES.md b/KNOWN_ISSUES.md index b678647..e08c41b 100644 --- a/KNOWN_ISSUES.md +++ b/KNOWN_ISSUES.md @@ -225,3 +225,7 @@ Cloud Hypervisor refuses to snapshot a VM that holds a vhost-user-fs share or a ## Runtime attached devices do not survive VM stop / clone / restore Attaches via `cocoon vm fs attach` and `cocoon vm device attach` are runtime-only — they live in the CH process state and are never written into the VM record, sidecar, or snapshot. After `vm stop`, `vm clone`, or `vm restore`, the user must re-run the attach commands. `vm inspect` reflects the live CH `vm.info` for running VMs and omits `attached_devices` for stopped VMs. This is by design: cocoon does not own the backend (virtiofsd / vfio-pci binding) and cannot guarantee the resource still exists across host events. + +## virtiofsd is a single-shot daemon + +Upstream virtiofsd serves exactly one vhost-user client and exits when that client disconnects. Consequence: after `cocoon vm fs detach`, the daemon is gone — a follow-up `cocoon vm fs attach` against the same socket path will hang or time out until a fresh `virtiofsd` instance is launched. The same applies after `cocoon vm stop` (CH closes the socket on shutdown). Scripts that cycle attach/detach should respawn virtiofsd between calls. This is a virtiofsd behavior, not a cocoon limitation. diff --git a/README.md b/README.md index bb12561..4c86266 100644 --- a/README.md +++ b/README.md @@ -428,7 +428,9 @@ Prerequisite: host has `intel_iommu=on` (or `amd_iommu=on`) on the kernel comman # Bind the device on the host (one-time per device). echo 0000:01:00.0 > /sys/bus/pci/drivers/vfio-pci/bind # see https://wiki.archlinux.org/title/PCI_passthrough_via_OVMF -# Attach (BDF or full sysfs path both accepted). +# --pci accepts: short BDF (01:00.0), full BDF (0000:01:00.0), or +# sysfs path under /sys/bus/pci/devices/. Other absolute paths are +# rejected so cocoon does not forward a non-PCI directory to CH. cocoon vm device attach my-vm --pci 01:00.0 --id mygpu # Detach. From 8a1095c3e368a9816b1b4ec4dbc9e47b54f3d3ad Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 10:18:40 +0800 Subject: [PATCH 14/22] extend, vm: /code review fixes (vmAPIOnce, omitempty, debug args, TODO) vmAPIOnce: vm.add-fs and vm.add-device are non-idempotent; a retry after CH has already accepted the device but the response was lost would surface as a 'duplicate id' rejection. Skip DoWithRetry on those two endpoints. Existing addDiskVM/addNetVM stay on the retry path (deterministic startup, no user-visible CLI surface today). chFs: align num_queues / queue_size with chDisk and chNet by adding omitempty so a zero value (e.g. from a programmatic caller that skipped Spec.Validate) is dropped rather than rejected by CH. debug: collapse printCommonCHArgs's 6 positional args into a chDebugArgs struct so future flags don't keep widening the signature. lifecycle: mirror the TODO(inspect) note from cloudhypervisor/extend.go so the dual vm.info round-trip is visible at the call site too. extend.go: flatten the 'else if' chain after a return path in DeviceAttach (style). 
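A runnable sketch of the omitempty change (struct shape copied from the api.go hunk below; package main and the socket path are scaffolding for illustration):

    package main

    import (
        "encoding/json"
        "fmt"
    )

    // chFs mirrors the patched wire shape: zero-valued queue fields are
    // now dropped from the JSON body, so CH applies its own defaults
    // instead of rejecting num_queues=0.
    type chFs struct {
        ID        string `json:"id,omitempty"`
        Tag       string `json:"tag"`
        Socket    string `json:"socket"`
        NumQueues int    `json:"num_queues,omitempty"`
        QueueSize int    `json:"queue_size,omitempty"`
    }

    func main() {
        b, _ := json.Marshal(chFs{Tag: "share", Socket: "/run/vfsd.sock"})
        fmt.Println(string(b)) // {"tag":"share","socket":"/run/vfsd.sock"}
    }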
--- cmd/vm/debug.go | 32 ++++++++++++++++----- cmd/vm/lifecycle.go | 4 +++ hypervisor/cloudhypervisor/api.go | 4 +-- hypervisor/cloudhypervisor/extend.go | 11 +++++-- hypervisor/cloudhypervisor/helper.go | 8 ++++++ utils/http.go | 43 +++++++++++++++++++--------- 6 files changed, 77 insertions(+), 25 deletions(-) diff --git a/cmd/vm/debug.go b/cmd/vm/debug.go index 1f601a5..cd8816d 100644 --- a/cmd/vm/debug.go +++ b/cmd/vm/debug.go @@ -194,22 +194,40 @@ func printCHDebug(configs []*types.StorageConfig, boot *types.BootConfig, vmCfg diskArgs := cloudhypervisor.DebugDiskCLIArgs([]*types.StorageConfig{{Path: cowPath, RO: false}}, cpu, diskQueueSize, noDirectIO) fmt.Printf(" \"%s\" \\\n", diskArgs[0]) } - printCommonCHArgs(cpu, maxCPU, memory, balloon, vmCfg.Windows, vmCfg.SharedMemory) + printCommonCHArgs(chDebugArgs{ + CPU: cpu, + MaxCPU: maxCPU, + MemoryMB: memory, + BalloonMB: balloon, + Windows: vmCfg.Windows, + SharedMemory: vmCfg.SharedMemory, + }) } -func printCommonCHArgs(cpu, maxCPU, memory, balloon int, windows, sharedMemory bool) { +// chDebugArgs is the per-call CH command-line shape for the debug printer. +// A struct keeps the printCommonCHArgs signature stable as more flags accrue. +type chDebugArgs struct { + CPU int + MaxCPU int + MemoryMB int + BalloonMB int + Windows bool + SharedMemory bool +} + +func printCommonCHArgs(a chDebugArgs) { cpuExtra := "" - if windows { + if a.Windows { cpuExtra = ",kvm_hyperv=on" } memExtra := "" - if sharedMemory { + if a.SharedMemory { memExtra = ",shared=on" } - fmt.Printf(" --cpus boot=%d,max=%d%s \\\n", cpu, maxCPU, cpuExtra) - fmt.Printf(" --memory size=%dM%s \\\n", memory, memExtra) + fmt.Printf(" --cpus boot=%d,max=%d%s \\\n", a.CPU, a.MaxCPU, cpuExtra) + fmt.Printf(" --memory size=%dM%s \\\n", a.MemoryMB, memExtra) fmt.Printf(" --rng src=/dev/urandom \\\n") - fmt.Printf(" --balloon size=%dM,deflate_on_oom=on,free_page_reporting=on \\\n", balloon) + fmt.Printf(" --balloon size=%dM,deflate_on_oom=on,free_page_reporting=on \\\n", a.BalloonMB) fmt.Printf(" --watchdog \\\n") fmt.Printf(" --serial tty --console off\n") } diff --git a/cmd/vm/lifecycle.go b/cmd/vm/lifecycle.go index 1b90946..860d8a1 100644 --- a/cmd/vm/lifecycle.go +++ b/cmd/vm/lifecycle.go @@ -115,6 +115,10 @@ func (h Handler) Inspect(cmd *cobra.Command, args []string) error { // collectAttachedDevices reads runtime fs/vfio device lists from the // backend. Errors are logged and dropped — inspect should not fail just // because vm.info is briefly unreachable. +// +// TODO(inspect): the two Lister calls each fetch their own vm.info. A +// combined Lister in extend/ would let inspect pay one HTTP round-trip +// instead of two. Mirrored on cloudhypervisor/extend.go FsList/DeviceList. 
func collectAttachedDevices(ctx context.Context, hyper hypervisor.Hypervisor, ref string) *attachedDevices { logger := log.WithFunc("cmd.vm.inspect") out := &attachedDevices{} diff --git a/hypervisor/cloudhypervisor/api.go b/hypervisor/cloudhypervisor/api.go index df8c2c4..b4b5b9b 100644 --- a/hypervisor/cloudhypervisor/api.go +++ b/hypervisor/cloudhypervisor/api.go @@ -87,8 +87,8 @@ type chFs struct { ID string `json:"id,omitempty"` Tag string `json:"tag"` Socket string `json:"socket"` - NumQueues int `json:"num_queues"` - QueueSize int `json:"queue_size"` + NumQueues int `json:"num_queues,omitempty"` + QueueSize int `json:"queue_size,omitempty"` } type chDevice struct { diff --git a/hypervisor/cloudhypervisor/extend.go b/hypervisor/cloudhypervisor/extend.go index f1f940a..e54235b 100644 --- a/hypervisor/cloudhypervisor/extend.go +++ b/hypervisor/cloudhypervisor/extend.go @@ -94,9 +94,11 @@ func (ch *CloudHypervisor) DeviceAttach(ctx context.Context, vmRef string, spec // host stat happens after the running-VM gate inside attachWith, // so a stopped VM reports the VM-state error instead of misleading // host path output. - if st, statErr := os.Stat(path); statErr != nil { + st, statErr := os.Stat(path) + if statErr != nil { return fmt.Errorf("pci path %s: %w", path, statErr) - } else if !st.IsDir() { + } + if !st.IsDir() { return fmt.Errorf("pci path %s: not a directory", path) } for _, ex := range info.Config.Devices { @@ -171,7 +173,10 @@ func (ch *CloudHypervisor) attachWith( if err != nil { return "", fmt.Errorf("marshal %s: %w", endpoint, err) } - resp, err := vmAPICall(ctx, hc, endpoint, bodyBytes, http.StatusOK, http.StatusNoContent) + // vm.add-fs / vm.add-device are not idempotent: a retry after CH already + // accepted the device but the response was lost would echo back as a + // misleading "duplicate id" rejection. vmAPIOnce skips the retry layer. + resp, err := vmAPIOnce(ctx, hc, endpoint, bodyBytes, http.StatusOK, http.StatusNoContent) if err != nil { return "", fmt.Errorf("%s: %w", endpoint, err) } diff --git a/hypervisor/cloudhypervisor/helper.go b/hypervisor/cloudhypervisor/helper.go index dd06cd8..9d0beec 100644 --- a/hypervisor/cloudhypervisor/helper.go +++ b/hypervisor/cloudhypervisor/helper.go @@ -104,6 +104,14 @@ func vmAPICall(ctx context.Context, hc *http.Client, endpoint string, body []byt return utils.DoAPIWithRetry(ctx, hc, http.MethodPut, "http://localhost/api/v1/"+endpoint, body, successCodes...) } +// vmAPIOnce sends a single PUT without DoWithRetry. Use for non-idempotent +// endpoints — e.g. vm.add-fs / vm.add-device — where a retry after a +// network drop or 5xx that landed on CH after the device was already added +// would surface as a misleading "duplicate id" rejection. +func vmAPIOnce(ctx context.Context, hc *http.Client, endpoint string, body []byte, successCodes ...int) ([]byte, error) { + return utils.DoAPIOnce(ctx, hc, http.MethodPut, "http://localhost/api/v1/"+endpoint, body, successCodes...) +} + func shutdownVM(ctx context.Context, hc *http.Client) error { return vmAPI(ctx, hc, "vm.shutdown", nil) } diff --git a/utils/http.go b/utils/http.go index 1b4eceb..4099130 100644 --- a/utils/http.go +++ b/utils/http.go @@ -133,19 +133,36 @@ func IsRetryable(err error) bool { // successCodes[0] is the primary code passed to DoAPI; codes[1:] are accepted // as success on retry (silent nil-body return). 
func DoAPIWithRetry(ctx context.Context, hc *http.Client, method, url string, body []byte, successCodes ...int) ([]byte, error) { - if len(successCodes) == 0 { - successCodes = []int{http.StatusNoContent} - } - primary := successCodes[0] + primary, alt := splitSuccessCodes(successCodes) return DoWithRetry(ctx, func() ([]byte, error) { - resp, apiErr := DoAPI(ctx, hc, method, url, body, primary) - if apiErr == nil { - return resp, nil - } - var ae *APIError - if errors.As(apiErr, &ae) && slices.Contains(successCodes[1:], ae.Code) { - return nil, nil - } - return nil, apiErr + return doAPIAcceptingAlt(ctx, hc, method, url, body, primary, alt) }) } + +// DoAPIOnce sends a single request without DoWithRetry. Use for endpoints +// whose action is non-idempotent: a retry after the request landed but the +// response was lost would surface as a duplicate / conflict error rather +// than a real failure (e.g. CH vm.add-fs / vm.add-device). +func DoAPIOnce(ctx context.Context, hc *http.Client, method, url string, body []byte, successCodes ...int) ([]byte, error) { + primary, alt := splitSuccessCodes(successCodes) + return doAPIAcceptingAlt(ctx, hc, method, url, body, primary, alt) +} + +func splitSuccessCodes(codes []int) (primary int, alt []int) { + if len(codes) == 0 { + return http.StatusNoContent, nil + } + return codes[0], codes[1:] +} + +func doAPIAcceptingAlt(ctx context.Context, hc *http.Client, method, url string, body []byte, primary int, alt []int) ([]byte, error) { + resp, apiErr := DoAPI(ctx, hc, method, url, body, primary) + if apiErr == nil { + return resp, nil + } + var ae *APIError + if errors.As(apiErr, &ae) && slices.Contains(alt, ae.Code) { + return nil, nil + } + return nil, apiErr +} From 36a23ea14173337d83e20ccfc68a4f424484cb56 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 10:53:26 +0800 Subject: [PATCH 15/22] extend, vm: review-followup — flatten attachGroup, Normalize, pidfile error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cmd/vm/commands.go: drop attachGroup struct + build() in favor of two inline buildFsCommand/buildDeviceCommand bodies. The struct's only job was to group config strings and flag callbacks for two callers, but their flag setup never actually overlapped — the indirection added ~30 lines without dedup'ing anything. Plain cobra construction reads top-to-bottom now. extend/fs: rename Spec.Validate to Spec.Normalize. The function applies defaults to the receiver, so the old name lied about purity. Mirrors vfio.Spec.NormalizedPath in spirit. cloudhypervisor/extend.go runningVMClient: surface a missing pidfile as 'read pidfile: ' instead of silently degrading to 'pid 0'. --- cmd/vm/commands.go | 106 ++++++++++++--------------- extend/fs/fs.go | 9 ++- extend/fs/fs_test.go | 4 +- hypervisor/cloudhypervisor/extend.go | 7 +- 4 files changed, 59 insertions(+), 67 deletions(-) diff --git a/cmd/vm/commands.go b/cmd/vm/commands.go index 52da71b..6cfebab 100644 --- a/cmd/vm/commands.go +++ b/cmd/vm/commands.go @@ -162,85 +162,69 @@ func Command(h Actions) *cobra.Command { return vmCmd } -// attachGroup describes a parent command that bundles attach/detach -// subcommands sharing the same VM-arg shape and JSON-output flag.
-type attachGroup struct { - parent string - parentDesc string - attachDesc string - detachDesc string - attachRun func(*cobra.Command, []string) error - detachRun func(*cobra.Command, []string) error - attachFlag func(*cobra.Command) - detachFlag func(*cobra.Command) -} - -func (g attachGroup) build() *cobra.Command { - parent := &cobra.Command{Use: g.parent, Short: g.parentDesc} +func buildFsCommand(h Actions) *cobra.Command { + parent := &cobra.Command{ + Use: "fs", + Short: "Attach/detach a vhost-user-fs share to a running VM (CH only)", + } attach := &cobra.Command{ Use: "attach VM", - Short: g.attachDesc, + Short: "Attach a vhost-user-fs device to a running VM", Args: cobra.ExactArgs(1), - RunE: g.attachRun, + RunE: h.FsAttach, } - g.attachFlag(attach) + attach.Flags().String("socket", "", "absolute path to a virtiofsd unix socket (required)") + attach.Flags().String("tag", "", "guest mount tag (required; also detach key)") + attach.Flags().Int("num-queues", 0, "request queues (0 = default 1)") + attach.Flags().Int("queue-size", 0, "queue depth (0 = default 1024)") //nolint:mnd + _ = attach.MarkFlagRequired("socket") + _ = attach.MarkFlagRequired("tag") cmdcore.AddOutputFlag(attach) detach := &cobra.Command{ Use: "detach VM", - Short: g.detachDesc, + Short: "Detach a vhost-user-fs device from a running VM", Args: cobra.ExactArgs(1), - RunE: g.detachRun, + RunE: h.FsDetach, } - g.detachFlag(detach) + detach.Flags().String("tag", "", "guest mount tag (required)") + _ = detach.MarkFlagRequired("tag") cmdcore.AddOutputFlag(detach) parent.AddCommand(attach, detach) return parent } -func buildFsCommand(h Actions) *cobra.Command { - return attachGroup{ - parent: "fs", - parentDesc: "Attach/detach a vhost-user-fs share to a running VM (CH only)", - attachDesc: "Attach a vhost-user-fs device to a running VM", - detachDesc: "Detach a vhost-user-fs device from a running VM", - attachRun: h.FsAttach, - detachRun: h.FsDetach, - attachFlag: func(c *cobra.Command) { - c.Flags().String("socket", "", "absolute path to a virtiofsd unix socket (required)") - c.Flags().String("tag", "", "guest mount tag (required; also detach key)") - c.Flags().Int("num-queues", 0, "request queues (0 = default 1)") - c.Flags().Int("queue-size", 0, "queue depth (0 = default 1024)") //nolint:mnd - _ = c.MarkFlagRequired("socket") - _ = c.MarkFlagRequired("tag") - }, - detachFlag: func(c *cobra.Command) { - c.Flags().String("tag", "", "guest mount tag (required)") - _ = c.MarkFlagRequired("tag") - }, - }.build() -} - func buildDeviceCommand(h Actions) *cobra.Command { - return attachGroup{ - parent: "device", - parentDesc: "Attach/detach a VFIO PCI passthrough device to a running VM (CH only)", - attachDesc: "Attach a VFIO PCI device to a running VM", - detachDesc: "Detach a VFIO PCI device from a running VM", - attachRun: h.DeviceAttach, - detachRun: h.DeviceDetach, - attachFlag: func(c *cobra.Command) { - c.Flags().String("pci", "", "BDF (01:00.0 / 0000:01:00.0) or sysfs path /sys/bus/pci/devices/") - c.Flags().String("id", "", "optional device id; CH auto-generates if empty (must not start with cocoon-)") - _ = c.MarkFlagRequired("pci") - }, - detachFlag: func(c *cobra.Command) { - c.Flags().String("id", "", "device id returned by attach (required)") - _ = c.MarkFlagRequired("id") - }, - }.build() + parent := &cobra.Command{ + Use: "device", + Short: "Attach/detach a VFIO PCI passthrough device to a running VM (CH only)", + } + + attach := &cobra.Command{ + Use: "attach VM", + Short: "Attach a VFIO PCI device to a 
running VM", + Args: cobra.ExactArgs(1), + RunE: h.DeviceAttach, + } + attach.Flags().String("pci", "", "BDF (01:00.0 / 0000:01:00.0) or sysfs path /sys/bus/pci/devices/") + attach.Flags().String("id", "", "optional device id; CH auto-generates if empty (must not start with cocoon-)") + _ = attach.MarkFlagRequired("pci") + cmdcore.AddOutputFlag(attach) + + detach := &cobra.Command{ + Use: "detach VM", + Short: "Detach a VFIO PCI device from a running VM", + Args: cobra.ExactArgs(1), + RunE: h.DeviceDetach, + } + detach.Flags().String("id", "", "device id returned by attach (required)") + _ = detach.MarkFlagRequired("id") + cmdcore.AddOutputFlag(detach) + + parent.AddCommand(attach, detach) + return parent } func addVMFlags(cmd *cobra.Command) { diff --git a/extend/fs/fs.go b/extend/fs/fs.go index 150e287..45dc736 100644 --- a/extend/fs/fs.go +++ b/extend/fs/fs.go @@ -56,8 +56,13 @@ type Lister interface { FsList(ctx context.Context, vmRef string) ([]Attached, error) } -// Validate enforces required fields and applies queue-size defaults. -func (s *Spec) Validate() error { +// Normalize enforces required fields and applies queue-size defaults. +// Mutates the receiver — callers expect the post-call Spec to be ready +// for serialization. Mirror vfio.Spec.NormalizedPath in spirit (compute +// + validate in one call) so future Specs in extend/ converge on the +// same idiom rather than fan out into pure-Validate vs. mutating-Validate +// styles. +func (s *Spec) Normalize() error { if s.Socket == "" { return fmt.Errorf("socket is required") } diff --git a/extend/fs/fs_test.go b/extend/fs/fs_test.go index 579ab2e..80989ca 100644 --- a/extend/fs/fs_test.go +++ b/extend/fs/fs_test.go @@ -5,7 +5,7 @@ import ( "testing" ) -func TestSpecValidate(t *testing.T) { +func TestSpecNormalize(t *testing.T) { tests := []struct { name string spec Spec @@ -24,7 +24,7 @@ func TestSpecValidate(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - err := tt.spec.Validate() + err := tt.spec.Normalize() if tt.wantErr != "" { if err == nil { t.Fatalf("want error containing %q, got nil", tt.wantErr) diff --git a/hypervisor/cloudhypervisor/extend.go b/hypervisor/cloudhypervisor/extend.go index e54235b..022bb26 100644 --- a/hypervisor/cloudhypervisor/extend.go +++ b/hypervisor/cloudhypervisor/extend.go @@ -25,7 +25,7 @@ var ( // FsAttach hot-plugs a vhost-user-fs device onto a running CH VM. 
func (ch *CloudHypervisor) FsAttach(ctx context.Context, vmRef string, spec fs.Spec) (string, error) { - if err := spec.Validate(); err != nil { + if err := spec.Normalize(); err != nil { return "", err } id := fs.DeriveID(spec.Tag) @@ -242,7 +242,10 @@ func (ch *CloudHypervisor) runningVMClient(ctx context.Context, vmRef string) (* return nil, fmt.Errorf("vm %s is %s: %w", vmID, rec.State, hypervisor.ErrNotRunning) } sockPath := hypervisor.SocketPath(rec.RunDir) - pid, _ := utils.ReadPIDFile(ch.PIDFilePath(rec.RunDir)) + pid, pidErr := utils.ReadPIDFile(ch.PIDFilePath(rec.RunDir)) + if pidErr != nil { + return nil, fmt.Errorf("vm %s read pidfile: %v: %w", vmID, pidErr, hypervisor.ErrNotRunning) + } if !utils.VerifyProcessCmdline(pid, ch.conf.BinaryName(), sockPath) { return nil, fmt.Errorf("vm %s pid %d not %s: %w", vmID, pid, ch.conf.BinaryName(), hypervisor.ErrNotRunning) } From 4be694366a1bc719ab82072a4a7d74e95cc5a2b4 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 11:12:11 +0800 Subject: [PATCH 16/22] extend, vm: reorder methods/standalone funcs per SKILL.md Three files I introduced/touched had standalone funcs interleaved with struct methods. SKILL.md mandates 'methods grouped, then standalone functions at the bottom': - cloudhypervisor/extend.go: listWith was placed between detachWith and runningVMClient methods. Moved to bottom alongside bdfFromSysfsPath. - cmd/vm/lifecycle.go: collectAttachedDevices was inserted between Inspect and Console handler methods. Moved to bottom of the file. - cmd/vm/debug.go: chDebugArgs type was declared between printCHDebug and printCommonCHArgs. Moved to top with the other type declarations. Self-review missed these during the /code walkthrough; flagged by the user. --- cmd/vm/debug.go | 22 +++++----- cmd/vm/lifecycle.go | 60 ++++++++++++++-------------- hypervisor/cloudhypervisor/extend.go | 34 ++++++++-------- 3 files changed, 58 insertions(+), 58 deletions(-) diff --git a/cmd/vm/debug.go b/cmd/vm/debug.go index cd8816d..6e06732 100644 --- a/cmd/vm/debug.go +++ b/cmd/vm/debug.go @@ -15,6 +15,17 @@ import ( "github.com/cocoonstack/cocoon/types" ) +// chDebugArgs is the per-call CH command-line shape for the debug printer. +// A struct keeps the printCommonCHArgs signature stable as more flags accrue. +type chDebugArgs struct { + CPU int + MaxCPU int + MemoryMB int + BalloonMB int + Windows bool + SharedMemory bool +} + func (h Handler) Debug(cmd *cobra.Command, args []string) error { ctx, conf, err := h.Init(cmd) if err != nil { @@ -204,17 +215,6 @@ func printCHDebug(configs []*types.StorageConfig, boot *types.BootConfig, vmCfg }) } -// chDebugArgs is the per-call CH command-line shape for the debug printer. -// A struct keeps the printCommonCHArgs signature stable as more flags accrue. -type chDebugArgs struct { - CPU int - MaxCPU int - MemoryMB int - BalloonMB int - Windows bool - SharedMemory bool -} - func printCommonCHArgs(a chDebugArgs) { cpuExtra := "" if a.Windows { diff --git a/cmd/vm/lifecycle.go b/cmd/vm/lifecycle.go index 860d8a1..21e3a42 100644 --- a/cmd/vm/lifecycle.go +++ b/cmd/vm/lifecycle.go @@ -112,36 +112,6 @@ func (h Handler) Inspect(cmd *cobra.Command, args []string) error { return cmdcore.OutputJSON(out) } -// collectAttachedDevices reads runtime fs/vfio device lists from the -// backend. Errors are logged and dropped — inspect should not fail just -// because vm.info is briefly unreachable. -// -// TODO(inspect): the two Lister calls each fetch their own vm.info. 
A -// combined Lister in extend/ would let inspect pay one HTTP round-trip -// instead of two. Mirrored on cloudhypervisor/extend.go FsList/DeviceList. -func collectAttachedDevices(ctx context.Context, hyper hypervisor.Hypervisor, ref string) *attachedDevices { - logger := log.WithFunc("cmd.vm.inspect") - out := &attachedDevices{} - if l, ok := hyper.(fs.Lister); ok { - if devs, err := l.FsList(ctx, ref); err != nil { - logger.Warnf(ctx, "list fs devices for %s: %v", ref, err) - } else { - out.Fs = devs - } - } - if l, ok := hyper.(vfio.Lister); ok { - if devs, err := l.DeviceList(ctx, ref); err != nil { - logger.Warnf(ctx, "list vfio devices for %s: %v", ref, err) - } else { - out.Devices = devs - } - } - if len(out.Fs) == 0 && len(out.Devices) == 0 { - return nil - } - return out -} - func (h Handler) Console(cmd *cobra.Command, args []string) error { ctx, conf, err := h.Init(cmd) if err != nil { @@ -345,3 +315,33 @@ func batchRoutedCmd(ctx context.Context, cmd *cobra.Command, name, pastTense str } return nil } + +// collectAttachedDevices reads runtime fs/vfio device lists from the +// backend. Errors are logged and dropped — inspect should not fail just +// because vm.info is briefly unreachable. +// +// TODO(inspect): the two Lister calls each fetch their own vm.info. A +// combined Lister in extend/ would let inspect pay one HTTP round-trip +// instead of two. Mirrored on cloudhypervisor/extend.go FsList/DeviceList. +func collectAttachedDevices(ctx context.Context, hyper hypervisor.Hypervisor, ref string) *attachedDevices { + logger := log.WithFunc("cmd.vm.inspect") + out := &attachedDevices{} + if l, ok := hyper.(fs.Lister); ok { + if devs, err := l.FsList(ctx, ref); err != nil { + logger.Warnf(ctx, "list fs devices for %s: %v", ref, err) + } else { + out.Fs = devs + } + } + if l, ok := hyper.(vfio.Lister); ok { + if devs, err := l.DeviceList(ctx, ref); err != nil { + logger.Warnf(ctx, "list vfio devices for %s: %v", ref, err) + } else { + out.Devices = devs + } + } + if len(out.Fs) == 0 && len(out.Devices) == 0 { + return nil + } + return out +} diff --git a/hypervisor/cloudhypervisor/extend.go b/hypervisor/cloudhypervisor/extend.go index 022bb26..1d80dcc 100644 --- a/hypervisor/cloudhypervisor/extend.go +++ b/hypervisor/cloudhypervisor/extend.go @@ -209,23 +209,6 @@ func (ch *CloudHypervisor) detachWith( return nil } -// listWith is the shared skeleton for inspect-time enumeration. -// Stopped VMs return a nil slice (not an error) so inspect can omit the -// field cleanly. -func listWith[A any]( - ctx context.Context, ch *CloudHypervisor, vmRef string, - extract func(*chVMInfoResponse) []A, -) ([]A, error) { - _, info, err := ch.inspectRunning(ctx, vmRef) - if err != nil { - if errors.Is(err, hypervisor.ErrNotRunning) { - return nil, nil - } - return nil, err - } - return extract(info), nil -} - // runningVMClient resolves vmRef, asserts the recorded CH process is still // alive (PID file + cmdline match — same gate as Backend.WithRunningVM), // and returns an http.Client connected to its CH API socket. @@ -252,6 +235,23 @@ func (ch *CloudHypervisor) runningVMClient(ctx context.Context, vmRef string) (* return utils.NewSocketHTTPClient(sockPath), nil } +// listWith is the shared skeleton for inspect-time enumeration. +// Stopped VMs return a nil slice (not an error) so inspect can omit the +// field cleanly. 
+func listWith[A any]( + ctx context.Context, ch *CloudHypervisor, vmRef string, + extract func(*chVMInfoResponse) []A, +) ([]A, error) { + _, info, err := ch.inspectRunning(ctx, vmRef) + if err != nil { + if errors.Is(err, hypervisor.ErrNotRunning) { + return nil, nil + } + return nil, err + } + return extract(info), nil +} + // bdfFromSysfsPath returns the BDF suffix when path is under the canonical // sysfs PCI prefix; empty otherwise (CH may report a non-PCI host path). func bdfFromSysfsPath(p string) string { From 25d945440afc444293732912ebf6ebba7a42c0cb Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 11:55:17 +0800 Subject: [PATCH 17/22] extend, vm: simplify utils/http helpers and route snapshot/restore to DoAPIOnce MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit splitSuccessCodes returned a (primary, alt) tuple that both callers fanned right back out into doAPIAcceptingAlt — three helpers for what is one operation. Inline into a single doAPI(...successCodes) that hides the default + alt-tolerance, and have DoAPIWithRetry / DoAPIOnce call it directly. snapshot/restore on both backends now uses DoAPIOnce instead of raw DoAPI. Behavior is unchanged (no alt codes, no retry either way), but the named helper makes the 'this is non-idempotent, do not retry' intent uniform with vm.add-fs / vm.add-device which already use vmAPIOnce. Comments updated to explain the *why* (multi-GiB memory transfer, partial state.json risk) instead of citing the old fcAPI/vmAPI retry layer. --- hypervisor/cloudhypervisor/helper.go | 4 ++-- hypervisor/firecracker/api.go | 9 ++++---- hypervisor/{snapshot_meta.go => snapshot.go} | 0 utils/http.go | 24 +++++++++----------- 4 files changed, 18 insertions(+), 19 deletions(-) rename hypervisor/{snapshot_meta.go => snapshot.go} (100%) diff --git a/hypervisor/cloudhypervisor/helper.go b/hypervisor/cloudhypervisor/helper.go index 9d0beec..4139cc1 100644 --- a/hypervisor/cloudhypervisor/helper.go +++ b/hypervisor/cloudhypervisor/helper.go @@ -135,7 +135,7 @@ func snapshotVM(ctx context.Context, hc *http.Client, destDir string) error { if err != nil { return fmt.Errorf("marshal snapshot request: %w", err) } - _, err = utils.DoAPI(ctx, hc, http.MethodPut, + _, err = utils.DoAPIOnce(ctx, hc, http.MethodPut, "http://localhost/api/v1/vm.snapshot", body, http.StatusNoContent) return err } @@ -153,7 +153,7 @@ func restoreVM(ctx context.Context, hc *http.Client, sourceDir string, onDemand if err != nil { return fmt.Errorf("marshal restore request: %w", err) } - _, err = utils.DoAPI(ctx, hc, http.MethodPut, + _, err = utils.DoAPIOnce(ctx, hc, http.MethodPut, "http://localhost/api/v1/vm.restore", body, http.StatusNoContent) return err } diff --git a/hypervisor/firecracker/api.go b/hypervisor/firecracker/api.go index ca4040e..fbc0ddc 100644 --- a/hypervisor/firecracker/api.go +++ b/hypervisor/firecracker/api.go @@ -180,7 +180,8 @@ func resumeVM(ctx context.Context, hc *http.Client) error { } // createSnapshotFC creates a full VM snapshot (vmstate + memory file) in destDir. -// Bypasses fcAPI's retry layer — see hypervisor.VMMemTransferTimeout. +// Bypasses retry — memory transfer takes minutes; resending after a transient +// error would re-transfer multi-GiB and overwrite a partial state.json. 
func createSnapshotFC(ctx context.Context, sockPath, destDir string) error { body, err := json.Marshal(fcSnapshotCreate{ SnapshotPath: filepath.Join(destDir, snapshotVMStateFile), @@ -190,14 +191,14 @@ func createSnapshotFC(ctx context.Context, sockPath, destDir string) error { return fmt.Errorf("marshal snapshot/create request: %w", err) } hc := utils.NewSocketHTTPClientWithTimeout(sockPath, hypervisor.VMMemTransferTimeout) - _, err = utils.DoAPI(ctx, hc, http.MethodPut, + _, err = utils.DoAPIOnce(ctx, hc, http.MethodPut, "http://localhost/snapshot/create", body, http.StatusNoContent) return err } // loadSnapshotFC loads a VM snapshot from sourceDir into a freshly started FC process. // networkOverrides replaces TAP devices from the snapshot with new ones. -// Bypasses fcAPI's retry layer — see hypervisor.VMMemTransferTimeout. +// Bypasses retry — same memory-transfer reasoning as createSnapshotFC. func loadSnapshotFC(ctx context.Context, sockPath, sourceDir string, networkOverrides []fcNetworkOverride) error { body, err := json.Marshal(fcSnapshotLoad{ SnapshotPath: filepath.Join(sourceDir, snapshotVMStateFile), @@ -211,7 +212,7 @@ func loadSnapshotFC(ctx context.Context, sockPath, sourceDir string, networkOver return fmt.Errorf("marshal snapshot/load request: %w", err) } hc := utils.NewSocketHTTPClientWithTimeout(sockPath, hypervisor.VMMemTransferTimeout) - _, err = utils.DoAPI(ctx, hc, http.MethodPut, + _, err = utils.DoAPIOnce(ctx, hc, http.MethodPut, "http://localhost/snapshot/load", body, http.StatusNoContent) return err } diff --git a/hypervisor/snapshot_meta.go b/hypervisor/snapshot.go similarity index 100% rename from hypervisor/snapshot_meta.go rename to hypervisor/snapshot.go diff --git a/utils/http.go b/utils/http.go index 4099130..4169270 100644 --- a/utils/http.go +++ b/utils/http.go @@ -133,35 +133,33 @@ func IsRetryable(err error) bool { // successCodes[0] is the primary code passed to DoAPI; codes[1:] are accepted // as success on retry (silent nil-body return). func DoAPIWithRetry(ctx context.Context, hc *http.Client, method, url string, body []byte, successCodes ...int) ([]byte, error) { - primary, alt := splitSuccessCodes(successCodes) return DoWithRetry(ctx, func() ([]byte, error) { - return doAPIAcceptingAlt(ctx, hc, method, url, body, primary, alt) + return doAPI(ctx, hc, method, url, body, successCodes...) }) } // DoAPIOnce sends a single request without DoWithRetry. Use for endpoints // whose action is non-idempotent: a retry after the request landed but the // response was lost would surface as a duplicate / conflict error rather -// than a real failure (e.g. CH vm.add-fs / vm.add-device). +// than a real failure (e.g. CH vm.add-fs / vm.add-device, snapshot/create). func DoAPIOnce(ctx context.Context, hc *http.Client, method, url string, body []byte, successCodes ...int) ([]byte, error) { - primary, alt := splitSuccessCodes(successCodes) - return doAPIAcceptingAlt(ctx, hc, method, url, body, primary, alt) + return doAPI(ctx, hc, method, url, body, successCodes...) } -func splitSuccessCodes(codes []int) (primary int, alt []int) { - if len(codes) == 0 { - return http.StatusNoContent, nil +// doAPI is DoAPI with two conveniences: it defaults to 204 No Content when +// successCodes is empty, and accepts successCodes[1:] as success (returning +// a nil body, mirroring the retry path's "tolerated alt" contract). 
+func doAPI(ctx context.Context, hc *http.Client, method, url string, body []byte, successCodes ...int) ([]byte, error) { + primary := http.StatusNoContent + if len(successCodes) > 0 { + primary = successCodes[0] } - return codes[0], codes[1:] -} - -func doAPIAcceptingAlt(ctx context.Context, hc *http.Client, method, url string, body []byte, primary int, alt []int) ([]byte, error) { resp, apiErr := DoAPI(ctx, hc, method, url, body, primary) if apiErr == nil { return resp, nil } var ae *APIError - if errors.As(apiErr, &ae) && slices.Contains(alt, ae.Code) { + if errors.As(apiErr, &ae) && len(successCodes) > 1 && slices.Contains(successCodes[1:], ae.Code) { return nil, nil } return nil, apiErr From 17813b0b185c67af1bdbf98b664d5334b8a505aa Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 12:09:14 +0800 Subject: [PATCH 18/22] cloudhypervisor: route removeDeviceVM through vmAPIOnce vm.remove-device is non-idempotent: if CH detaches the device but the response is lost, the DoAPIWithRetry retry hits 'id not found' and the user sees a failure for an operation that actually succeeded. This is the same failure mode the previous review pass fixed for vm.add-fs / vm.add-device. Route removeDeviceVM through vmAPIOnce for consistent semantics across attach and detach hot paths. Callers are clone.go's hotSwapNets (admin path), extend.go's FsDetach and DeviceDetach (user-initiated CLI). Last two are the ones that surface the bad UX directly. --- hypervisor/cloudhypervisor/helper.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hypervisor/cloudhypervisor/helper.go b/hypervisor/cloudhypervisor/helper.go index 4139cc1..1d29952 100644 --- a/hypervisor/cloudhypervisor/helper.go +++ b/hypervisor/cloudhypervisor/helper.go @@ -166,12 +166,17 @@ func addDiskVM(ctx context.Context, hc *http.Client, disk chDisk) error { return vmAPI(ctx, hc, "vm.add-disk", body, http.StatusOK, http.StatusNoContent) } +// removeDeviceVM is non-idempotent: a retry after CH already detached the +// device but the response was lost would surface as "id not found" and mask +// the original success. Route through vmAPIOnce, same shape as the add-* hot +// paths. func removeDeviceVM(ctx context.Context, hc *http.Client, deviceID string) error { body, err := json.Marshal(map[string]string{"id": deviceID}) if err != nil { return fmt.Errorf("marshal remove-device request: %w", err) } - return vmAPI(ctx, hc, "vm.remove-device", body) + _, err = vmAPIOnce(ctx, hc, "vm.remove-device", body) + return err } func addNetVM(ctx context.Context, hc *http.Client, net chNet) error { From 3830b73d7d7e559c973ae8b0698a494ad6b82ec5 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 12:16:31 +0800 Subject: [PATCH 19/22] cloudhypervisor, firecracker: route non-idempotent endpoints through *APIOnce MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same correctness reasoning as removeDeviceVM and the earlier vm.add-fs / vm.add-device pass: a retry after a state-mutating endpoint already succeeded but the response was lost surfaces as a wrong-state error ("already paused", "already shut down", "duplicate id") and masks the original success. 
Migrated to vmAPIOnce / fcAPIOnce: CH shutdownVM, pauseVM, resumeVM (used by stop and snapshot/restore) CH addDiskVM, addNetVM (used by clone after vm.restore) FC pauseVM, resumeVM (PATCH /vm) FC instanceStart (PUT /actions InstanceStart) Kept on retry-enabled fcAPI / vmAPI (pre-boot config or semi-idempotent): FC putMachineConfig, putBootSource, putDrive, putNetworkInterface, putBalloon, putEntropy — pre-boot PUT, retry overwrites cleanly FC sendCtrlAltDel — repeating a key event is harmless CH powerButton — ACPI button press, no-op if already off Side effects: - fcAPI loses its method param (always PUT now) and successCodes (always 204). Trim the signature to keep unparam happy. If a future caller needs alt codes, route through DoAPIWithRetry directly. - New fcAPIOnce keeps the method param because pause/resume use PATCH while instanceStart uses PUT. --- hypervisor/cloudhypervisor/helper.go | 22 ++++++++++---- hypervisor/firecracker/api.go | 45 ++++++++++++++++++---------- 2 files changed, 47 insertions(+), 20 deletions(-) diff --git a/hypervisor/cloudhypervisor/helper.go b/hypervisor/cloudhypervisor/helper.go index 1d29952..69c3822 100644 --- a/hypervisor/cloudhypervisor/helper.go +++ b/hypervisor/cloudhypervisor/helper.go @@ -112,16 +112,22 @@ func vmAPIOnce(ctx context.Context, hc *http.Client, endpoint string, body []byt return utils.DoAPIOnce(ctx, hc, http.MethodPut, "http://localhost/api/v1/"+endpoint, body, successCodes...) } +// shutdownVM, pauseVM, resumeVM are all CH state transitions; a second call +// in the wrong state returns an error. Route through vmAPIOnce so a retry +// after a lost response cannot mask the original success. func shutdownVM(ctx context.Context, hc *http.Client) error { - return vmAPI(ctx, hc, "vm.shutdown", nil) + _, err := vmAPIOnce(ctx, hc, "vm.shutdown", nil) + return err } func pauseVM(ctx context.Context, hc *http.Client) error { - return vmAPI(ctx, hc, "vm.pause", nil) + _, err := vmAPIOnce(ctx, hc, "vm.pause", nil) + return err } func resumeVM(ctx context.Context, hc *http.Client) error { - return vmAPI(ctx, hc, "vm.resume", nil) + _, err := vmAPIOnce(ctx, hc, "vm.resume", nil) + return err } // snapshotVM and restoreVM temporarily extend the client timeout for @@ -158,12 +164,17 @@ func restoreVM(ctx context.Context, hc *http.Client, sourceDir string, onDemand return err } +// addDiskVM and addNetVM are the same non-idempotent shape as add-fs/add-device +// — a retry after CH already attached the device hits "duplicate id" and +// masks the original success. Used during clone (cidata hot-plug, NIC swap) +// after vm.restore. 
func addDiskVM(ctx context.Context, hc *http.Client, disk chDisk) error { body, err := json.Marshal(disk) if err != nil { return fmt.Errorf("marshal add-disk request: %w", err) } - return vmAPI(ctx, hc, "vm.add-disk", body, http.StatusOK, http.StatusNoContent) + _, err = vmAPIOnce(ctx, hc, "vm.add-disk", body, http.StatusOK, http.StatusNoContent) + return err } // removeDeviceVM is non-idempotent: a retry after CH already detached the @@ -184,7 +195,8 @@ func addNetVM(ctx context.Context, hc *http.Client, net chNet) error { if err != nil { return fmt.Errorf("marshal add-net request: %w", err) } - return vmAPI(ctx, hc, "vm.add-net", body, http.StatusOK, http.StatusNoContent) + _, err = vmAPIOnce(ctx, hc, "vm.add-net", body, http.StatusOK, http.StatusNoContent) + return err } // getVMInfo fetches vm.info; cocoon uses it to detect tag/id conflicts diff --git a/hypervisor/firecracker/api.go b/hypervisor/firecracker/api.go index fbc0ddc..9119343 100644 --- a/hypervisor/firecracker/api.go +++ b/hypervisor/firecracker/api.go @@ -95,9 +95,22 @@ type fcSnapshotMemBE struct { BackendType string `json:"backend_type"` } -// fcAPI sends a request to the Firecracker REST API via Unix socket. -func fcAPI(ctx context.Context, hc *http.Client, method, endpoint string, body []byte, successCodes ...int) error { - _, err := utils.DoAPIWithRetry(ctx, hc, method, "http://localhost"+endpoint, body, successCodes...) +// fcAPI PUTs body to a Firecracker REST endpoint with retry. Use only for +// idempotent endpoints (pre-boot config — putMachineConfig, putBootSource, +// putDrive, putNetworkInterface, putBalloon, putEntropy — where a retry +// PUT just overwrites the same config). All current callers expect 204; if +// a future endpoint needs alt success codes, switch to DoAPIWithRetry +// directly instead of widening this wrapper. +func fcAPI(ctx context.Context, hc *http.Client, endpoint string, body []byte) error { + _, err := utils.DoAPIWithRetry(ctx, hc, http.MethodPut, "http://localhost"+endpoint, body) + return err +} + +// fcAPIOnce is the no-retry variant for non-idempotent state transitions +// (instance-start, pause/resume) where a retry after a lost response would +// hit a wrong-state error and mask the original success. +func fcAPIOnce(ctx context.Context, hc *http.Client, method, endpoint string, body []byte, successCodes ...int) error { + _, err := utils.DoAPIOnce(ctx, hc, method, "http://localhost"+endpoint, body, successCodes...) 
return err } @@ -106,7 +119,7 @@ func putMachineConfig(ctx context.Context, hc *http.Client, cfg fcMachineConfig) if err != nil { return fmt.Errorf("marshal machine-config: %w", err) } - return fcAPI(ctx, hc, http.MethodPut, "/machine-config", body) + return fcAPI(ctx, hc, "/machine-config", body) } func putBootSource(ctx context.Context, hc *http.Client, boot fcBootSource) error { @@ -114,7 +127,7 @@ func putBootSource(ctx context.Context, hc *http.Client, boot fcBootSource) erro if err != nil { return fmt.Errorf("marshal boot-source: %w", err) } - return fcAPI(ctx, hc, http.MethodPut, "/boot-source", body) + return fcAPI(ctx, hc, "/boot-source", body) } func putDrive(ctx context.Context, hc *http.Client, drive fcDrive) error { @@ -122,7 +135,7 @@ func putDrive(ctx context.Context, hc *http.Client, drive fcDrive) error { if err != nil { return fmt.Errorf("marshal drive: %w", err) } - return fcAPI(ctx, hc, http.MethodPut, "/drives/"+drive.DriveID, body) + return fcAPI(ctx, hc, "/drives/"+drive.DriveID, body) } func putBalloon(ctx context.Context, hc *http.Client, balloon fcBalloon) error { @@ -130,7 +143,7 @@ func putBalloon(ctx context.Context, hc *http.Client, balloon fcBalloon) error { if err != nil { return fmt.Errorf("marshal balloon: %w", err) } - return fcAPI(ctx, hc, http.MethodPut, "/balloon", body) + return fcAPI(ctx, hc, "/balloon", body) } func putNetworkInterface(ctx context.Context, hc *http.Client, iface fcNetworkInterface) error { @@ -138,11 +151,11 @@ func putNetworkInterface(ctx context.Context, hc *http.Client, iface fcNetworkIn if err != nil { return fmt.Errorf("marshal network-interface: %w", err) } - return fcAPI(ctx, hc, http.MethodPut, "/network-interfaces/"+iface.IfaceID, body) + return fcAPI(ctx, hc, "/network-interfaces/"+iface.IfaceID, body) } func putEntropy(ctx context.Context, hc *http.Client) error { - return fcAPI(ctx, hc, http.MethodPut, "/entropy", []byte("{}")) + return fcAPI(ctx, hc, "/entropy", []byte("{}")) } func instanceStart(ctx context.Context, hc *http.Client) error { @@ -150,7 +163,7 @@ func instanceStart(ctx context.Context, hc *http.Client) error { if err != nil { return fmt.Errorf("marshal action: %w", err) } - return fcAPI(ctx, hc, http.MethodPut, "/actions", body, http.StatusNoContent, http.StatusOK) + return fcAPIOnce(ctx, hc, http.MethodPut, "/actions", body, http.StatusNoContent, http.StatusOK) } func sendCtrlAltDel(ctx context.Context, hc *http.Client) error { @@ -158,25 +171,27 @@ func sendCtrlAltDel(ctx context.Context, hc *http.Client) error { if err != nil { return fmt.Errorf("marshal action: %w", err) } - return fcAPI(ctx, hc, http.MethodPut, "/actions", body) + return fcAPI(ctx, hc, "/actions", body) } -// pauseVM pauses a running FC instance via PATCH /vm. +// pauseVM pauses a running FC instance via PATCH /vm. Non-idempotent: a +// retry after a lost response would hit "already paused" and mask success. func pauseVM(ctx context.Context, hc *http.Client) error { body, err := json.Marshal(map[string]string{"state": vmStatePaused}) if err != nil { return fmt.Errorf("marshal pause request: %w", err) } - return fcAPI(ctx, hc, http.MethodPatch, "/vm", body) + return fcAPIOnce(ctx, hc, http.MethodPatch, "/vm", body) } -// resumeVM resumes a paused FC instance via PATCH /vm. +// resumeVM resumes a paused FC instance via PATCH /vm. Same non-idempotent +// shape as pauseVM. 
func resumeVM(ctx context.Context, hc *http.Client) error { body, err := json.Marshal(map[string]string{"state": vmStateResumed}) if err != nil { return fmt.Errorf("marshal resume request: %w", err) } - return fcAPI(ctx, hc, http.MethodPatch, "/vm", body) + return fcAPIOnce(ctx, hc, http.MethodPatch, "/vm", body) } // createSnapshotFC creates a full VM snapshot (vmstate + memory file) in destDir. From 22918e839e7a1cb17b4f6611f1669e65d38100ad Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 13:19:35 +0800 Subject: [PATCH 20/22] docs: trim verbose comments Cut WHAT-restating godoc and inline comments per SKILL.md (default to no comments; only keep WHY when non-obvious). Net -146 LOC across 15 files; no behavior changes. --- hypervisor/backend.go | 46 ++++++------------- hypervisor/cloudhypervisor/clone.go | 33 ++++--------- hypervisor/cloudhypervisor/direct.go | 20 ++------ hypervisor/cloudhypervisor/restore.go | 11 ++--- hypervisor/cloudhypervisor/snapshot.go | 17 ++----- hypervisor/cloudhypervisor/start.go | 17 ++----- hypervisor/firecracker/clone.go | 36 ++++----------- hypervisor/firecracker/create.go | 20 +++----- hypervisor/firecracker/direct.go | 22 ++------- hypervisor/firecracker/restore.go | 11 ++--- hypervisor/firecracker/snapshot.go | 17 ++----- hypervisor/firecracker/start.go | 17 ++----- hypervisor/snapshot.go | 38 ++++----------- ...snapshot_meta_test.go => snapshot_test.go} | 0 utils/map.go | 8 +--- utils/stream.go | 7 ++- 16 files changed, 87 insertions(+), 233 deletions(-) rename hypervisor/{snapshot_meta_test.go => snapshot_test.go} (100%) diff --git a/hypervisor/backend.go b/hypervisor/backend.go index 7fd6029..2d84bef 100644 --- a/hypervisor/backend.go +++ b/hypervisor/backend.go @@ -144,11 +144,9 @@ func (b *Backend) WithRunningVM(ctx context.Context, rec *VMRecord, fn func(pid return fn(pid) } -// WithPausedVM pauses a running VM, runs fn, then resumes it. Resume is -// guaranteed via defer: if fn returns an error the resume failure is logged -// and the inner error is propagated; if fn succeeds the resume error is -// promoted to the return value (snapshot data is captured but the VM is -// stuck). Both backends use this around snapshot capture. +// WithPausedVM pauses, runs fn, resumes. Resume on the success path is eager +// so its error promotes to the return; on fn-error the deferred resume only +// logs (the inner error wins). func (b *Backend) WithPausedVM(ctx context.Context, rec *VMRecord, pause, resume, fn func() error) error { return b.WithRunningVM(ctx, rec, func(_ int) error { if err := pause(); err != nil { @@ -665,21 +663,17 @@ func SocketPath(runDir string) string { return filepath.Join(runDir, APISocketNa func ConsoleSockPath(runDir string) string { return filepath.Join(runDir, ConsoleSockName) } -// LaunchSpec carries the per-call inputs to LaunchVMProcess. The shared -// BinaryName / SocketWaitTimeout come from the Backend's BackendConfig and -// don't repeat here. type LaunchSpec struct { Cmd *exec.Cmd PIDPath string SockPath string NetnsPath string // empty = host netns - OnFail func() // optional cleanup hook on any error path + OnFail func() // optional cleanup on any error path } -// LaunchVMProcess optionally enters a netns, starts spec.Cmd, writes the PID -// file, and waits for the API socket. On any post-Start error the process is -// killed and the PID file is removed before returning. The caller is -// responsible for reaping cmd via cmd.Wait() in a goroutine on success. 
+// LaunchVMProcess starts spec.Cmd and waits for the API socket. On any error +// after Start, the process is killed and the PID file is removed. Caller +// reaps cmd via cmd.Wait() in a goroutine on success. func (b *Backend) LaunchVMProcess(ctx context.Context, spec LaunchSpec) (pid int, err error) { started := false pidWritten := false @@ -725,8 +719,7 @@ func (b *Backend) LaunchVMProcess(ctx context.Context, spec LaunchSpec) (pid int return pid, nil } -// PrepareStagingDir creates a sibling staging directory, extracts the snapshot -// tar into it, and returns a cleanup function that removes the staging dir. +// PrepareStagingDir extracts the snapshot tar into a sibling staging dir. func PrepareStagingDir(runDir string, snapshot io.Reader) (stagingDir string, cleanup func(), err error) { stagingDir = runDir + ".restore-staging" if err = os.RemoveAll(stagingDir); err != nil { @@ -743,24 +736,18 @@ func PrepareStagingDir(runDir string, snapshot io.Reader) (stagingDir string, cl return stagingDir, cleanup, nil } -// RestoreSpec carries the backend-specific hooks for RestoreSequence. The -// shared template handles host-CPU validation, ResolveForRestore, staging, -// preflight, kill, and merge — backends supply only the differing pieces. type RestoreSpec struct { VMCfg *types.VMConfig Snapshot io.Reader - OverrideCheck func(rec *VMRecord, vmCfg *types.VMConfig) error // optional pre-kill resource validation + OverrideCheck func(rec *VMRecord, vmCfg *types.VMConfig) error Preflight func(stagingDir string, rec *VMRecord) error Kill func(ctx context.Context, vmID string, rec *VMRecord) error - Wrap func(rec *VMRecord, fn func() error) error // optional disk lock wrapping merge+afterExtract - BeforeMerge func(rec *VMRecord) error // optional pre-merge hook (e.g. FC removes stale COW) + Wrap func(rec *VMRecord, fn func() error) error // optional disk lock around merge+afterExtract + BeforeMerge func(rec *VMRecord) error // e.g. FC removes stale COW AfterExtract func(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord) (*types.VM, error) } -// DirectRestoreSpec mirrors RestoreSpec but takes a local source directory -// rather than a streaming tar. Populate replaces the staging+merge step: -// backends decide how to copy the snapshot files into rec.RunDir (typically -// clean old artifacts, then per-file hardlink/reflink/copy). +// DirectRestoreSpec is RestoreSpec for a local srcDir rather than a tar; Populate replaces staging+merge. type DirectRestoreSpec struct { VMCfg *types.VMConfig SrcDir string @@ -772,9 +759,7 @@ type DirectRestoreSpec struct { AfterExtract func(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord) (*types.VM, error) } -// RestoreSequence runs the shared restore flow used by both backends: -// validate host CPU, resolve VM, stage tar, preflight, kill, merge, then -// hand off to AfterExtract for backend-specific resume. +// RestoreSequence: validate CPU → resolve → stage → preflight → kill → merge → AfterExtract. func (b *Backend) RestoreSequence(ctx context.Context, vmRef string, spec RestoreSpec) (*types.VM, error) { if err := ValidateHostCPU(spec.VMCfg.CPU); err != nil { return nil, err @@ -827,10 +812,7 @@ func (b *Backend) RestoreSequence(ctx context.Context, vmRef string, spec Restor return result, nil } -// DirectRestoreSequence runs the local-snapshot-dir variant of restore: the -// caller supplies a srcDir already on disk. 
Same outage-safe ordering as -// RestoreSequence (preflight before kill, optional disk lock around populate -// + afterExtract). +// DirectRestoreSequence: like RestoreSequence but Populate replaces staging+merge. func (b *Backend) DirectRestoreSequence(ctx context.Context, vmRef string, spec DirectRestoreSpec) (*types.VM, error) { if err := ValidateHostCPU(spec.VMCfg.CPU); err != nil { return nil, err diff --git a/hypervisor/cloudhypervisor/clone.go b/hypervisor/cloudhypervisor/clone.go index 4410ee1..1d7c70d 100644 --- a/hypervisor/cloudhypervisor/clone.go +++ b/hypervisor/cloudhypervisor/clone.go @@ -32,12 +32,10 @@ type cloneResumeOpts struct { onDemand bool } -// Clone creates a new VM from a snapshot tar stream. func (ch *CloudHypervisor) Clone(ctx context.Context, vmID string, vmCfg *types.VMConfig, networkConfigs []*types.NetworkConfig, snapshotConfig *types.SnapshotConfig, snapshot io.Reader) (*types.VM, error) { return ch.CloneFromStream(ctx, vmID, vmCfg, networkConfigs, snapshotConfig, snapshot, ch.cloneAfterExtract) } -// cloneAfterExtract resumes from snapshot data already placed in runDir. func (ch *CloudHypervisor) cloneAfterExtract(ctx context.Context, vmID string, vmCfg *types.VMConfig, networkConfigs []*types.NetworkConfig, runDir, logDir string, now time.Time) (*types.VM, error) { logger := log.WithFunc("cloudhypervisor.Clone") @@ -65,7 +63,7 @@ func (ch *CloudHypervisor) cloneAfterExtract(ctx context.Context, vmID string, v } updateDataDiskPaths(storageConfigs, runDir) - // Snapshot may omit cidata if taken after restart. + // Snapshot may omit cidata if taken post-restart. hadCidataInSnapshot := updateCloneCidataPath(storageConfigs, directBoot, ch.conf.CidataPath(vmID)) if err = hypervisor.VerifyBaseFiles(storageConfigs, bootCfg); err != nil { @@ -79,7 +77,6 @@ func (ch *CloudHypervisor) cloneAfterExtract(ctx context.Context, vmID string, v stateReplacements := buildStateReplacements(chCfg, storageConfigs) - // Regenerate cidata for the clone's identity and network config. storageConfigs, err = ch.ensureCloneCidata(vmID, vmCfg, networkConfigs, storageConfigs, directBoot) if err != nil { return nil, err @@ -88,7 +85,6 @@ func (ch *CloudHypervisor) cloneAfterExtract(ctx context.Context, vmID string, v return nil, fmt.Errorf("validate post-cidata storage: %w", vErr) } - // If the snapshot lacked cidata, patch only snapshot disks and hotplug cidata later. patchStorageConfigs := restorePatchStorageConfigs(storageConfigs, directBoot, vmCfg.Windows, hadCidataInSnapshot) consoleSock := hypervisor.ConsoleSockPath(runDir) @@ -105,13 +101,11 @@ func (ch *CloudHypervisor) cloneAfterExtract(ctx context.Context, vmID string, v return nil, fmt.Errorf("patch CH config: %w", err) } - // Keep state.json disk paths readable after cloning. stateJSONPath := filepath.Join(runDir, "state.json") if err = patchStateJSON(stateJSONPath, stateReplacements); err != nil { return nil, fmt.Errorf("patch state.json: %w", err) } - // Refresh direct-boot cmdline for later restarts. if directBoot && bootCfg != nil { dns, dnsErr := ch.conf.DNSServers() if dnsErr != nil { @@ -209,7 +203,7 @@ func (ch *CloudHypervisor) ensureCloneCidata(vmID string, vmCfg *types.VMConfig, return nil, fmt.Errorf("generate cidata: %w", err) } cidataPath := ch.conf.CidataPath(vmID) - // Keep cidata in the record for later cold boots even if the snapshot omitted it. + // Keep cidata in record for later cold boots even if snapshot omitted it. 
if !slices.ContainsFunc(storageConfigs, hasCidataRole) { storageConfigs = append(storageConfigs, &types.StorageConfig{ Path: cidataPath, @@ -262,14 +256,12 @@ func updateCloneCidataPath(storageConfigs []*types.StorageConfig, directBoot boo return hadCidataInSnapshot } -// hasCidataRole is a slices.ContainsFunc predicate. func hasCidataRole(sc *types.StorageConfig) bool { return sc.Role == types.StorageRoleCidata } -// restorePatchStorageConfigs strips the cidata entry ensureCloneCidata -// appended to a no-cidata snapshot, so patchCHConfig matches chCfg.Disks -// count. Cidata is hot-plugged later. +// restorePatchStorageConfigs strips ensureCloneCidata's appended cidata when +// the snapshot lacked one, so patchCHConfig matches chCfg.Disks; cidata gets hot-plugged. func restorePatchStorageConfigs(storageConfigs []*types.StorageConfig, directBoot, windows, hadCidataInSnapshot bool) []*types.StorageConfig { if directBoot || windows || hadCidataInSnapshot { return storageConfigs @@ -283,10 +275,7 @@ func restorePatchStorageConfigs(storageConfigs []*types.StorageConfig, directBoo return out } -// updateDataDiskPaths rewrites every Role==Data entry's Path to live under -// newRunDir, derived from the disk's serial. Used by clone after sidecar -// load: the sidecar carries the source VM's runDir, but the clone's data -// disk files have been reflinked into the clone's runDir. +// updateDataDiskPaths rewrites Role==Data paths to clone runDir; sidecar carries source paths. func updateDataDiskPaths(configs []*types.StorageConfig, newRunDir string) { for _, sc := range configs { if sc.Role == types.StorageRoleData { @@ -324,10 +313,9 @@ func buildCmdline(storageConfigs []*types.StorageConfig, networkConfigs []*types return cmdline.String() } -// buildStateReplacements maps source disk paths to clone paths for -// state.json patching. Iterates min(chCfg.Disks, storageConfigs) so an -// appended cidata in storageConfigs doesn't desync the prefix. -// MACs are not patched here; NIC hot-swap rewrites the virtio-net device. +// buildStateReplacements maps source disk paths to clone paths for state.json +// patching. Slices to min length so an appended cidata in storageConfigs +// doesn't desync the prefix; MACs are handled separately by NIC hot-swap. func buildStateReplacements(chCfg *chVMConfig, storageConfigs []*types.StorageConfig) map[string]string { n := min(len(chCfg.Disks), len(storageConfigs)) m := make(map[string]string, n) @@ -339,9 +327,8 @@ func buildStateReplacements(chCfg *chVMConfig, storageConfigs []*types.StorageCo return m } -// hotSwapNets removes old NICs (carrying stale MAC from snapshot binary state) -// and adds new ones with the correct MAC/TAP configuration. -// Must be called while VM is paused (between vm.restore and vm.resume). +// hotSwapNets removes NICs with stale MAC (from snapshot binary state) and +// adds fresh ones. Must run between vm.restore and vm.resume (VM paused). func hotSwapNets(ctx context.Context, hc *http.Client, oldNets []chNet, networkConfigs []*types.NetworkConfig) error { logger := log.WithFunc("cloudhypervisor.hotSwapNets") for _, oldNet := range oldNets { diff --git a/hypervisor/cloudhypervisor/direct.go b/hypervisor/cloudhypervisor/direct.go index 8dd7344..999af0b 100644 --- a/hypervisor/cloudhypervisor/direct.go +++ b/hypervisor/cloudhypervisor/direct.go @@ -10,16 +10,12 @@ import ( "github.com/cocoonstack/cocoon/types" ) -// DirectClone creates a new VM from a local snapshot directory. 
-// Files are handled per-type: hardlink for memory-range-*, reflink/copy for -// the COW disk, plain copy for small metadata, and cidata is regenerated. +// DirectClone clones from a local snapshot dir. Per-type: hardlink memory-range-*, +// reflink/copy COW, plain copy metadata; cidata is regenerated. func (ch *CloudHypervisor) DirectClone(ctx context.Context, vmID string, vmCfg *types.VMConfig, networkConfigs []*types.NetworkConfig, snapshotConfig *types.SnapshotConfig, srcDir string) (*types.VM, error) { return ch.DirectCloneBase(ctx, vmID, vmCfg, networkConfigs, snapshotConfig, srcDir, cloneSnapshotFiles, ch.cloneAfterExtract) } -// DirectRestore reverts a running VM using a local snapshot directory. -// Files are handled per-type: hardlink for memory-range-*, reflink/copy for -// the COW disk, plain copy for small metadata. func (ch *CloudHypervisor) DirectRestore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, srcDir string) (*types.VM, error) { return ch.DirectRestoreSequence(ctx, vmRef, hypervisor.DirectRestoreSpec{ VMCfg: vmCfg, @@ -34,8 +30,6 @@ func (ch *CloudHypervisor) DirectRestore(ctx context.Context, vmRef string, vmCf }) } -// populateRunDirFromSrc cleans rec.RunDir of old snapshot files then copies -// new ones from srcDir using the per-backend cloneSnapshotFiles strategy. func populateRunDirFromSrc(rec *hypervisor.VMRecord, srcDir string) error { if err := cleanSnapshotFiles(rec.RunDir); err != nil { return fmt.Errorf("clean old snapshot files: %w", err) @@ -46,9 +40,6 @@ func populateRunDirFromSrc(rec *hypervisor.VMRecord, srcDir string) error { return nil } -// cloneSnapshotFiles copies snapshot files from srcDir to dstDir using -// per-file strategies: hardlink/symlink for memory-range-*, reflink/sparse -// for COW disks, plain copy for metadata (config.json, state.json). func cloneSnapshotFiles(dstDir, srcDir string) error { chCfg, _, err := parseCHConfig(filepath.Join(srcDir, "config.json")) if err != nil { @@ -67,10 +58,8 @@ func cloneSnapshotFiles(dstDir, srcDir string) error { }) } -// cleanSnapshotFiles removes snapshot-specific files from runDir. -// COW files have arbitrary names and are overwritten by cloneSnapshotFiles; -// data disks (data-.raw) and the cocoon.json sidecar are listed -// explicitly so stale survivors from a previous incarnation don't linger. +// cleanSnapshotFiles enumerates by name so stale data-*.raw and cocoon.json +// from a previous incarnation don't linger; COW files are overwritten anyway. func cleanSnapshotFiles(runDir string) error { return hypervisor.CleanSnapshotFiles(runDir, func(name string) bool { switch { @@ -87,7 +76,6 @@ func cleanSnapshotFiles(runDir string) error { }) } -// identifyCOWFiles returns the set of basenames of writable (COW) disk files. func identifyCOWFiles(cfg *chVMConfig) map[string]bool { m := make(map[string]bool) for _, d := range cfg.Disks { diff --git a/hypervisor/cloudhypervisor/restore.go b/hypervisor/cloudhypervisor/restore.go index 2e918e0..5f5310b 100644 --- a/hypervisor/cloudhypervisor/restore.go +++ b/hypervisor/cloudhypervisor/restore.go @@ -13,7 +13,6 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -// Restore stages a snapshot tar before replacing the running VM state. 
func (ch *CloudHypervisor) Restore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, snapshot io.Reader) (*types.VM, error) { return ch.RestoreSequence(ctx, vmRef, hypervisor.RestoreSpec{ VMCfg: vmCfg, @@ -27,9 +26,8 @@ func (ch *CloudHypervisor) Restore(ctx context.Context, vmRef string, vmCfg *typ }) } -// preflightRestore loads the sidecar from srcDir, runs structural validation, -// then asserts the snapshot's role sequence is a valid prefix of the live -// record (cidata-only suffix on rec is the one allowed extension). +// preflightRestore validates the sidecar and asserts the snapshot's role +// sequence is a valid prefix of rec (cidata-only suffix is the one extension). func (ch *CloudHypervisor) preflightRestore(srcDir string, rec *hypervisor.VMRecord) error { meta, err := hypervisor.LoadAndValidateMeta(srcDir, ch.conf.RootDir, ch.conf.Config.RunDir) if err != nil { @@ -41,7 +39,6 @@ func (ch *CloudHypervisor) preflightRestore(srcDir string, rec *hypervisor.VMRec return hypervisor.ValidateRoleSequence(meta.StorageConfigs, rec.StorageConfigs) } -// killForRestore stops the running CH process and cleans up runtime files. func (ch *CloudHypervisor) killForRestore(ctx context.Context, vmID string, rec *hypervisor.VMRecord) error { sockPath := hypervisor.SocketPath(rec.RunDir) return ch.KillForRestore(ctx, vmID, rec, func(pid int) error { @@ -49,7 +46,6 @@ func (ch *CloudHypervisor) killForRestore(ctx context.Context, vmID string, rec }, runtimeFiles) } -// restoreAfterExtract resumes from snapshot data already placed in runDir. func (ch *CloudHypervisor) restoreAfterExtract(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *hypervisor.VMRecord, directBoot bool, cowPath string) (_ *types.VM, err error) { logger := log.WithFunc("cloudhypervisor.Restore") @@ -60,8 +56,7 @@ func (ch *CloudHypervisor) restoreAfterExtract(ctx context.Context, vmID string, }() chConfigPath := filepath.Join(rec.RunDir, "config.json") - // Use sidecar length; rec may have trailing cidata that the snapshot - // lacks (cloudimg post-first-boot), prefix-slice trims it. + // rec may have trailing cidata absent from the snapshot (cloudimg post-first-boot); slice to sidecar length. meta, metaErr := hypervisor.LoadAndValidateMeta(rec.RunDir, ch.conf.RootDir, ch.conf.Config.RunDir) if metaErr != nil { return nil, fmt.Errorf("load snapshot meta: %w", metaErr) diff --git a/hypervisor/cloudhypervisor/snapshot.go b/hypervisor/cloudhypervisor/snapshot.go index 764db98..f5845a6 100644 --- a/hypervisor/cloudhypervisor/snapshot.go +++ b/hypervisor/cloudhypervisor/snapshot.go @@ -12,9 +12,7 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -// Snapshot pauses the VM, captures its full state (CPU, memory, devices via CH -// snapshot API, plus the COW disk via sparse copy), resumes the VM, and returns -// a streaming tar.gz reader of the snapshot directory. +// Snapshot pauses, captures CH state+COW, resumes, and streams the result. func (ch *CloudHypervisor) Snapshot(ctx context.Context, ref string) (*types.SnapshotConfig, io.ReadCloser, error) { vmID, err := ch.ResolveRef(ctx, ref) if err != nil { @@ -32,7 +30,6 @@ func (ch *CloudHypervisor) Snapshot(ctx context.Context, ref string) (*types.Sna sockPath := hypervisor.SocketPath(rec.RunDir) hc := utils.NewSocketHTTPClient(sockPath) - // Determine COW file path and name inside the tar archive. 
directBoot := isDirectBoot(rec.BootConfig) cowPath := ch.cowPath(vmID, directBoot) cowName := "overlay.qcow2" @@ -40,7 +37,6 @@ func (ch *CloudHypervisor) Snapshot(ctx context.Context, ref string) (*types.Sna cowName = "cow.raw" } - // Create a temporary directory for the snapshot data. tmpDir, err := os.MkdirTemp(ch.conf.VMRunDir(vmID), "snapshot-") if err != nil { return nil, nil, fmt.Errorf("create temp dir: %w", err) @@ -61,8 +57,7 @@ func (ch *CloudHypervisor) Snapshot(ctx context.Context, ref string) (*types.Sna return nil, nil, fmt.Errorf("snapshot VM %s: %w", vmID, err) } - // For cloudimg VMs, include cidata.img (per-VM cloud-init disk). - // cidata is read-only and static, so it can be copied outside the pause window. + // cidata is RO + static — safe to copy outside the pause window. if !directBoot && !rec.Config.Windows { cidataSrc := ch.conf.CidataPath(vmID) if _, statErr := os.Stat(cidataSrc); statErr == nil { @@ -78,7 +73,6 @@ func (ch *CloudHypervisor) Snapshot(ctx context.Context, ref string) (*types.Sna return nil, nil, fmt.Errorf("write snapshot metadata: %w", metaErr) } - // Generate snapshot ID and record it on the VM atomically. snapID, recErr := ch.RecordSnapshot(ctx, vmID, tmpDir) if recErr != nil { return nil, nil, recErr @@ -87,10 +81,9 @@ func (ch *CloudHypervisor) Snapshot(ctx context.Context, ref string) (*types.Sna return ch.BuildSnapshotConfig(snapID, &rec), utils.TarDirStreamWithRemove(tmpDir), nil } -// writeSnapshotMeta builds the cocoon.json sidecar by mirroring the -// snapshot's config.json shape. Using activeDisks(rec) would diverge for a -// cloudimg VM snapshotted post-FirstBooted but pre-restart: CH still holds -// cidata, but activeDisks would skip it. +// writeSnapshotMeta mirrors config.json's disk shape. activeDisks(rec) would +// diverge for cloudimg post-FirstBooted but pre-restart: CH still holds cidata, +// activeDisks would skip it. func writeSnapshotMeta(tmpDir string, rec *hypervisor.VMRecord) error { chCfg, _, err := parseCHConfig(filepath.Join(tmpDir, "config.json")) if err != nil { diff --git a/hypervisor/cloudhypervisor/start.go b/hypervisor/cloudhypervisor/start.go index 2af7f05..4404616 100644 --- a/hypervisor/cloudhypervisor/start.go +++ b/hypervisor/cloudhypervisor/start.go @@ -14,8 +14,6 @@ import ( "github.com/cocoonstack/cocoon/types" ) -// Start launches the Cloud Hypervisor process for each VM ref. -// Returns the IDs that were successfully started. func (ch *CloudHypervisor) Start(ctx context.Context, refs []string) ([]string, error) { return ch.StartAll(ctx, refs, ch.startOne) } @@ -48,10 +46,6 @@ func (ch *CloudHypervisor) startOne(ctx context.Context, id string) error { return nil } -// launchProcess starts the cloud-hypervisor binary with the given args, -// writes the PID file, waits for the API socket to be ready, then releases -// the process handle so CH lives as an independent OS process past the -// lifetime of this binary. func (ch *CloudHypervisor) launchProcess(ctx context.Context, rec *hypervisor.VMRecord, socketPath string, args []string, withNetwork bool) (int, error) { processLog := filepath.Join(rec.LogDir, "cloud-hypervisor.log") logFile, err := os.Create(processLog) //nolint:gosec @@ -61,17 +55,15 @@ func (ch *CloudHypervisor) launchProcess(ctx context.Context, rec *hypervisor.VM defer logFile.Close() //nolint:errcheck } - // shell out because launching the Cloud Hypervisor process (external binary is the authoritative implementation). cmd := exec.Command(ch.conf.CHBinary, args...) 
//nolint:gosec - // Detach from the parent process group so CH survives if this process exits. + // Setpgid so CH survives if this process exits. cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} if logFile != nil { cmd.Stdout = logFile cmd.Stderr = logFile } - // CNI mode: TAP is inside a per-VM netns, switch before fork. - // Bridge mode: TAP is in host netns, no EnterNetns needed. + // CNI mode: enter per-VM netns before fork. Bridge mode: TAP is in host netns. netnsPath := "" if withNetwork && rec.NetworkConfigs[0].NetnsPath != "" { netnsPath = rec.NetworkConfigs[0].NetnsPath @@ -87,10 +79,7 @@ func (ch *CloudHypervisor) launchProcess(ctx context.Context, rec *hypervisor.VM return 0, err } - // Reap the child process asynchronously to prevent zombies. - // In CLI mode init adopts the orphan, but in daemon mode the - // long-lived parent must wait() or the child becomes a zombie - // that blocks IsProcessAlive checks during stop/delete. + // Daemon mode: parent must wait() or zombie blocks IsProcessAlive on stop/delete. go cmd.Wait() //nolint:errcheck return pid, nil } diff --git a/hypervisor/firecracker/clone.go b/hypervisor/firecracker/clone.go index 97d3d75..30c263f 100644 --- a/hypervisor/firecracker/clone.go +++ b/hypervisor/firecracker/clone.go @@ -17,20 +17,16 @@ import ( const cloneBackupSuffix = ".cocoon-clone-backup" -// driveRedirect records a temporary symlink replacing a source drive path. type driveRedirect struct { symlinkPath string backupPath string createdDir bool } -// Clone creates a new VM from a snapshot tar stream via FC snapshot/load. func (fc *Firecracker) Clone(ctx context.Context, vmID string, vmCfg *types.VMConfig, networkConfigs []*types.NetworkConfig, snapshotConfig *types.SnapshotConfig, snapshot io.Reader) (*types.VM, error) { return fc.CloneFromStream(ctx, vmID, vmCfg, networkConfigs, snapshotConfig, snapshot, fc.cloneAfterExtract) } -// cloneAfterExtract contains all clone logic after snapshot data is in runDir. -// FC snapshots require the same directory layout — paths are absolute. func (fc *Firecracker) cloneAfterExtract(ctx context.Context, vmID string, vmCfg *types.VMConfig, networkConfigs []*types.NetworkConfig, runDir, logDir string, now time.Time) (*types.VM, error) { logger := log.WithFunc("firecracker.Clone") @@ -39,7 +35,7 @@ func (fc *Firecracker) cloneAfterExtract(ctx context.Context, vmID string, vmCfg return nil, fmt.Errorf("load snapshot metadata: %w", err) } - // FC cannot update CPU/memory after snapshot/load. Reject overrides early. + // FC cannot update CPU/memory after snapshot/load. if meta.CPU > 0 && vmCfg.CPU != meta.CPU { return nil, fmt.Errorf("--cpu %d not supported: Firecracker cannot change CPU after snapshot/load (snapshot has %d)", vmCfg.CPU, meta.CPU) } @@ -47,14 +43,12 @@ func (fc *Firecracker) cloneAfterExtract(ctx context.Context, vmID string, vmCfg return nil, fmt.Errorf("--memory not supported: Firecracker cannot change memory after snapshot/load") } - // Move extracted COW to canonical path. cowPath := fc.conf.COWRawPath(vmID) snapshotCOW := filepath.Join(runDir, cowFileName) if renameErr := os.Rename(snapshotCOW, cowPath); renameErr != nil { return nil, fmt.Errorf("move COW to canonical path: %w", renameErr) } - // Rebuild storage: reuse RO layer paths from metadata, new COW path. 
storageConfigs, err := rebuildCloneStorage(meta, cowPath) if err != nil { return nil, err @@ -88,10 +82,9 @@ func (fc *Firecracker) cloneAfterExtract(ctx context.Context, vmID string, vmCfg bootCfg.Cmdline = buildCmdline(storageConfigs, networkConfigs, vmCfg.Name, dns) } - // FC snapshot/load requires drives at the same absolute paths as the source. - // RO layers are shared blobs (same path). Only the COW path changed. - // TODO: Replace symlink redirect with drive_overrides when FC PR #5774 - // (github.com/firecracker-microvm/firecracker/pull/5774) is merged. + // FC snapshot/load requires drives at source absolute paths; only COW path + // changed, so symlink-redirect the source path until drive_overrides + // (firecracker-microvm/firecracker#5774) lands. sockPath := hypervisor.SocketPath(runDir) withNetwork := len(networkConfigs) > 0 var pid int @@ -144,13 +137,10 @@ func (fc *Firecracker) restoreAndResumeClone( } }() - // Use network_overrides to provide clone's TAP devices during snapshot/load. - // FC re-creates network devices from vmstate — overrides replace the - // source TAP with the clone's TAP so FC opens the right device. + // network_overrides points FC at the clone's TAP instead of the source TAP + // (FC recreates devices from vmstate). The earlier symlink redirect makes + // FC open the clone's COW; held fds survive its cleanup. netOverrides := buildNetworkOverrides(networkConfigs) - // FC opens drive files during snapshot/load and holds the fds. - // The symlink redirect ensures FC opens the clone's COW, not the source's. - // No drive reconfiguration needed — fds survive symlink cleanup. if err = loadSnapshotFC(ctx, sockPath, runDir, netOverrides); err != nil { return fmt.Errorf("snapshot/load: %w", err) } @@ -161,21 +151,15 @@ func (fc *Firecracker) restoreAndResumeClone( return nil } -// rebuildCloneStorage produces the clone's storage list in the same order as -// the snapshot's, rewriting paths as needed: -// - Layer disks keep their absolute paths (shared image blobs) -// - COW moves to the clone's cowPath -// - Data disks move to the clone's runDir under data-.raw -// -// FC has no cloudimg path, so Role==Cidata in meta is unexpected; treat it -// as an error rather than silently dropping or copying it. +// rebuildCloneStorage rewrites paths per role: Layer keeps source path (shared +// blob), COW → clone cowPath, Data → clone runDir. Cidata is rejected (FC has +// no cloudimg path). func rebuildCloneStorage(meta *hypervisor.SnapshotMeta, cowPath string) ([]*types.StorageConfig, error) { runDir := filepath.Dir(cowPath) configs := hypervisor.CloneStorageConfigs(meta.StorageConfigs) for i, sc := range configs { switch sc.Role { case types.StorageRoleLayer: - // keep Path as-is case types.StorageRoleCOW: sc.Path = cowPath case types.StorageRoleData: diff --git a/hypervisor/firecracker/create.go b/hypervisor/firecracker/create.go index ba2a908..54d775f 100644 --- a/hypervisor/firecracker/create.go +++ b/hypervisor/firecracker/create.go @@ -18,7 +18,6 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -// Create reserves a VM record, prepares disks, and leaves the VM in Created state. 
func (fc *Firecracker) Create(ctx context.Context, id string, vmCfg *types.VMConfig, storageConfigs []*types.StorageConfig, networkConfigs []*types.NetworkConfig, bootCfg *types.BootConfig) (_ *types.VM, err error) { if err = hypervisor.ValidateHostCPU(vmCfg.CPU); err != nil { return nil, err @@ -70,7 +69,6 @@ func (fc *Firecracker) Create(ctx context.Context, id string, vmCfg *types.VMCon return info, nil } -// prepareOCI creates the raw COW disk and final kernel cmdline. func (fc *Firecracker) prepareOCI(ctx context.Context, vmID string, vmCfg *types.VMConfig, storageConfigs []*types.StorageConfig, networkConfigs []*types.NetworkConfig, boot *types.BootConfig) ([]*types.StorageConfig, error) { storageConfigs, err := hypervisor.PrepareOCICOW(ctx, fc.conf.COWRawPath(vmID), vmCfg.Storage, storageConfigs) if err != nil { @@ -99,7 +97,7 @@ func (fc *Firecracker) prepareOCI(ctx context.Context, vmID string, vmCfg *types return storageConfigs, nil } -// EnsureVmlinux returns an uncompressed ELF kernel path. +// EnsureVmlinux decompresses kernelPath if needed and returns the ELF path. func EnsureVmlinux(kernelPath string) (string, error) { elfMagic := []byte{0x7f, 'E', 'L', 'F'} @@ -138,7 +136,6 @@ func EnsureVmlinux(kernelPath string) (string, error) { return vmlinuxPath, nil } -// decompressKernel scans a bzImage for supported compressed payloads. func decompressKernel(data []byte) ([]byte, error) { type kernelCodec struct { name string @@ -178,8 +175,8 @@ func decompressZstd(data []byte) ([]byte, error) { return nil, fmt.Errorf("new zstd reader: %w", err) } defer dec.Close() - // DecodeAll may error on trailing data after the first frame, but any - // prefix already written to out is valid — caller validates via ELF magic. + // DecodeAll may error on trailing data after the first frame; any prefix + // already in out is valid — caller validates via ELF magic. out, err := dec.DecodeAll(data, nil) if len(out) == 0 { return nil, fmt.Errorf("zstd decode: %w", err) @@ -202,12 +199,9 @@ func buildCmdline(storageConfigs []*types.StorageConfig, networkConfigs []*types cowDev := devPath(len(layerDevs)) var cmdline strings.Builder - // FC serial console is ttyS0 (not hvc0 like CH's virtio-console). - // reboot=k: FC has no ACPI PM — use i8042 keyboard controller reset so - // guest reboot/shutdown triggers FC process exit instead of hanging. - // pci=off: FC has no PCI bus, skip probing (~50-100ms saved) - // i8042.noaux: no PS/2 mouse, skip auxiliary device probe timeout - // 8250.nr_uarts=1: FC exposes one serial port, skip probing for 3 others + // FC quirks: ttyS0 (not hvc0), reboot=k (no ACPI PM, use i8042 reset so + // shutdown exits FC instead of hanging), pci=off + i8042.noaux + + // 8250.nr_uarts=1 skip probes for absent hardware. fmt.Fprintf(&cmdline, "console=ttyS0 reboot=k loglevel=3 pci=off i8042.noaux 8250.nr_uarts=1 boot=cocoon-overlay cocoon.layers=%s cocoon.cow=%s clocksource=kvm-clock rw", strings.Join(layerDevs, ","), cowDev, @@ -221,7 +215,7 @@ func buildCmdline(storageConfigs []*types.StorageConfig, networkConfigs []*types return cmdline.String() } -// Follows Linux naming: vda..vdz, vdaa..vdaz, vdba..vdbz, ... +// devPath maps idx to vda..vdz, vdaa..vdaz, vdba..vdbz, ... 
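+//
+// Worked examples of the mapping (derived from the sequence above):
+//
+//	idx  0 → vda    25 → vdz    26 → vdaa    51 → vdaz    52 → vdba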
func devPath(idx int) string { const letters = 26 if idx < letters { diff --git a/hypervisor/firecracker/direct.go b/hypervisor/firecracker/direct.go index 23d0531..0782b3f 100644 --- a/hypervisor/firecracker/direct.go +++ b/hypervisor/firecracker/direct.go @@ -9,16 +9,12 @@ import ( "github.com/cocoonstack/cocoon/types" ) -// DirectClone creates a new VM from a local snapshot directory. -// Files are handled per-type: hardlink for mem, reflink/copy for -// the COW disk, plain copy for small metadata (vmstate). +// DirectClone clones from a local snapshot dir. Per-type: hardlink mem, +// reflink/copy COW, plain copy metadata. func (fc *Firecracker) DirectClone(ctx context.Context, vmID string, vmCfg *types.VMConfig, networkConfigs []*types.NetworkConfig, snapshotConfig *types.SnapshotConfig, srcDir string) (*types.VM, error) { return fc.DirectCloneBase(ctx, vmID, vmCfg, networkConfigs, snapshotConfig, srcDir, cloneSnapshotFiles, fc.cloneAfterExtract) } -// DirectRestore reverts a running VM using a local snapshot directory. -// Files are handled per-type: hardlink for mem, reflink/copy for -// the COW disk, plain copy for small metadata (vmstate). func (fc *Firecracker) DirectRestore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, srcDir string) (*types.VM, error) { return fc.DirectRestoreSequence(ctx, vmRef, hypervisor.DirectRestoreSpec{ VMCfg: vmCfg, @@ -26,9 +22,8 @@ func (fc *Firecracker) DirectRestore(ctx context.Context, vmRef string, vmCfg *t OverrideCheck: validateRestoreOverrides, Preflight: fc.preflightRestore, Kill: fc.killForRestore, - // Lock every writable disk so recoverStaleBackup heals stale - // data-*.raw.cocoon-clone-backup before restore overwrites them; - // otherwise a future clone would rename the backup over restored data. + // Lock writable disks so recoverStaleBackup heals stale data-*.raw.cocoon-clone-backup + // before restore overwrites them; otherwise a future clone renames backup over restored data. Wrap: func(rec *hypervisor.VMRecord, inner func() error) error { return withSourceWritableDisksLocked(rec.StorageConfigs, inner) }, @@ -39,8 +34,6 @@ func (fc *Firecracker) DirectRestore(ctx context.Context, vmRef string, vmCfg *t }) } -// populateRunDirFromSrc cleans rec.RunDir of old snapshot files then copies -// new ones from srcDir using the per-backend cloneSnapshotFiles strategy. func populateRunDirFromSrc(rec *hypervisor.VMRecord, srcDir string) error { if err := cleanSnapshotFiles(rec.RunDir); err != nil { return fmt.Errorf("clean old snapshot files: %w", err) @@ -51,9 +44,6 @@ func populateRunDirFromSrc(rec *hypervisor.VMRecord, srcDir string) error { return nil } -// cloneSnapshotFiles copies snapshot files from srcDir to dstDir using -// per-file strategies: hardlink/symlink for mem, reflink/sparse for COW, -// plain copy for metadata (vmstate). func cloneSnapshotFiles(dstDir, srcDir string) error { return hypervisor.CloneSnapshotFiles(dstDir, srcDir, func(name string) hypervisor.SnapshotFileKind { switch { @@ -67,9 +57,7 @@ func cloneSnapshotFiles(dstDir, srcDir string) error { }) } -// cleanSnapshotFiles removes snapshot-specific files from runDir. -// Data disks (data-.raw) are listed explicitly so stale survivors -// don't linger across restore. +// cleanSnapshotFiles enumerates by name so stale data-*.raw don't linger across restore. 
func cleanSnapshotFiles(runDir string) error { return hypervisor.CleanSnapshotFiles(runDir, func(name string) bool { switch name { diff --git a/hypervisor/firecracker/restore.go b/hypervisor/firecracker/restore.go index dc7d885..d725d24 100644 --- a/hypervisor/firecracker/restore.go +++ b/hypervisor/firecracker/restore.go @@ -14,7 +14,6 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -// Restore stages a snapshot tar before replacing the running VM state. func (fc *Firecracker) Restore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, snapshot io.Reader) (*types.VM, error) { return fc.RestoreSequence(ctx, vmRef, hypervisor.RestoreSpec{ VMCfg: vmCfg, @@ -22,9 +21,8 @@ func (fc *Firecracker) Restore(ctx context.Context, vmRef string, vmCfg *types.V OverrideCheck: validateRestoreOverrides, Preflight: fc.preflightRestore, Kill: fc.killForRestore, - // Lock every writable disk so recoverStaleBackup heals stale - // data-*.raw.cocoon-clone-backup before restore overwrites them; - // otherwise a future clone would rename the backup over restored data. + // Lock writable disks so recoverStaleBackup heals stale data-*.raw.cocoon-clone-backup + // before restore overwrites them; otherwise a future clone renames backup over restored data. Wrap: func(rec *hypervisor.VMRecord, inner func() error) error { return withSourceWritableDisksLocked(rec.StorageConfigs, inner) }, @@ -38,7 +36,6 @@ func (fc *Firecracker) Restore(ctx context.Context, vmRef string, vmCfg *types.V }) } -// killForRestore stops the running FC process and cleans up runtime files. func (fc *Firecracker) killForRestore(ctx context.Context, vmID string, rec *hypervisor.VMRecord) error { return fc.KillForRestore(ctx, vmID, rec, func(pid int) error { sockPath := hypervisor.SocketPath(rec.RunDir) @@ -46,7 +43,6 @@ func (fc *Firecracker) killForRestore(ctx context.Context, vmID string, rec *hyp }, runtimeFiles) } -// restoreAfterExtract resumes from snapshot data already placed in runDir. func (fc *Firecracker) restoreAfterExtract(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *hypervisor.VMRecord, cowPath string) (_ *types.VM, err error) { logger := log.WithFunc("firecracker.Restore") @@ -85,7 +81,6 @@ func (fc *Firecracker) restoreAfterExtract(ctx context.Context, vmID string, vmC } }() - // Restore keeps the same VM, so drives and TAPs stay unchanged. if err = loadSnapshotFC(ctx, sockPath, rec.RunDir, nil); err != nil { return nil, fmt.Errorf("snapshot/load: %w", err) } @@ -99,7 +94,7 @@ func (fc *Firecracker) restoreAfterExtract(ctx context.Context, vmID string, vmC return fc.FinalizeRestore(ctx, vmID, vmCfg, rec, pid) } -// validateRestoreOverrides rejects CPU or memory changes FC cannot apply after load. +// validateRestoreOverrides rejects CPU/memory changes FC cannot apply post-load. func validateRestoreOverrides(rec *hypervisor.VMRecord, vmCfg *types.VMConfig) error { if vmCfg.CPU != rec.Config.CPU { return fmt.Errorf("--cpu %d not supported: Firecracker cannot change CPU after snapshot/load (VM has %d)", vmCfg.CPU, rec.Config.CPU) diff --git a/hypervisor/firecracker/snapshot.go b/hypervisor/firecracker/snapshot.go index d69a190..fba68de 100644 --- a/hypervisor/firecracker/snapshot.go +++ b/hypervisor/firecracker/snapshot.go @@ -17,9 +17,7 @@ const ( snapshotMemFile = "mem" ) -// Snapshot pauses the VM, captures its full state (CPU/device state via FC -// snapshot API + memory file + COW disk via reflink copy), resumes the VM, -// and returns a streaming tar.gz reader of the snapshot directory. 
+// Snapshot pauses, captures vmstate+mem+COW, resumes, and streams the result. func (fc *Firecracker) Snapshot(ctx context.Context, ref string) (*types.SnapshotConfig, io.ReadCloser, error) { vmID, err := fc.ResolveRef(ctx, ref) if err != nil { @@ -45,9 +43,8 @@ func (fc *Firecracker) Snapshot(ctx context.Context, ref string) (*types.Snapsho pause := func() error { return pauseVM(ctx, hc) } resume := func() error { return resumeVM(context.WithoutCancel(ctx), hc) } - // Lock every writable disk so a concurrent clone's rename+symlink - // redirect can't race this snapshot's reflink. Dictionary order keeps - // concurrent callers deadlock-free. + // Lock writable disks: a concurrent clone's rename+symlink redirect would + // otherwise race this snapshot's reflink. Dictionary order avoids deadlock. if err := withSourceWritableDisksLocked(rec.StorageConfigs, func() error { return fc.WithPausedVM(ctx, &rec, pause, resume, func() error { if err := createSnapshotFC(ctx, sockPath, tmpDir); err != nil { @@ -63,9 +60,6 @@ func (fc *Firecracker) Snapshot(ctx context.Context, ref string) (*types.Snapsho return nil, nil, fmt.Errorf("snapshot VM %s: %w", vmID, err) } - // Save snapshot metadata (absolute paths) so clones can reconstruct - // storage/boot config without depending on live VM records. - // FC snapshots require the same directory layout — paths are stored as-is. if metaErr := hypervisor.SaveSnapshotMeta(tmpDir, buildSnapshotMeta(&rec)); metaErr != nil { os.RemoveAll(tmpDir) //nolint:errcheck,gosec return nil, nil, fmt.Errorf("save snapshot metadata: %w", metaErr) @@ -79,9 +73,8 @@ func (fc *Firecracker) Snapshot(ctx context.Context, ref string) (*types.Snapsho return fc.BuildSnapshotConfig(snapID, &rec), utils.TarDirStreamWithRemove(tmpDir), nil } -// buildSnapshotMeta builds the FC sidecar from a live VM record. The kernel -// path is rewritten to vmlinuz so clones receive the portable artifact, -// not the FC-specific vmlinux cache. +// buildSnapshotMeta rewrites kernel path to vmlinuz so clones get the portable +// artifact instead of the FC-specific vmlinux cache. func buildSnapshotMeta(rec *hypervisor.VMRecord) *hypervisor.SnapshotMeta { meta := &hypervisor.SnapshotMeta{ CPU: rec.Config.CPU, diff --git a/hypervisor/firecracker/start.go b/hypervisor/firecracker/start.go index 2135bd4..3e0dbdc 100644 --- a/hypervisor/firecracker/start.go +++ b/hypervisor/firecracker/start.go @@ -18,8 +18,7 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -// Start launches the Firecracker process for each VM ref, configures it -// via the REST API, and issues InstanceStart. +// Start launches FC, configures via REST, then InstanceStart. func (fc *Firecracker) Start(ctx context.Context, refs []string) ([]string, error) { return fc.StartAll(ctx, refs, fc.startOne) } @@ -46,7 +45,6 @@ func (fc *Firecracker) startOne(ctx context.Context, id string) error { return fmt.Errorf("launch VM: %w", err) } - // Configure VM via REST API and start the instance. if err := fc.configureVM(ctx, utils.NewSocketHTTPClient(sockPath), rec); err != nil { fc.AbortLaunch(ctx, pid, sockPath, rec.RunDir, runtimeFiles) fc.MarkError(ctx, id) @@ -55,8 +53,7 @@ func (fc *Firecracker) startOne(ctx context.Context, id string) error { return nil } -// configureVM sends the pre-boot configuration to FC via REST API, -// then issues InstanceStart to boot the guest. +// configureVM sends pre-boot config via REST then InstanceStart. 
 func (fc *Firecracker) configureVM(ctx context.Context, hc *http.Client, rec *hypervisor.VMRecord) error {
 	memMiB := int(rec.Config.Memory >> 20) //nolint:mnd
 	hugePages := hugePagesNone
@@ -113,8 +110,7 @@ func (fc *Firecracker) configureVM(ctx context.Context, hc *http.Client, rec *hy
 		}
 	}
 
-	// Balloon: 25% of memory returned, only when memory >= MinBalloonMemory.
-	// Matches Cloud Hypervisor's balloon behavior.
+	// Balloon 25% of memory when memory >= MinBalloonMemory (matches CH).
 	if rec.Config.Memory >= hypervisor.MinBalloonMemory {
 		balloonMiB := memMiB / hypervisor.DefaultBalloonDiv
 		if err := putBalloon(ctx, hc, fcBalloon{
@@ -136,13 +132,10 @@ func (fc *Firecracker) configureVM(ctx context.Context, hc *http.Client, rec *hy
 	}
 	return nil
 }
 
-// launchProcess starts the firecracker binary with --api-sock,
-// creates a PTY pair for the serial console, starts a background
-// relay process for console.sock, writes the PID file, and waits
-// for the API socket.
+// launchProcess starts firecracker, sets up PTY+console relay, waits for socket.
 func (fc *Firecracker) launchProcess(ctx context.Context, rec *hypervisor.VMRecord, sockPath string, withNetwork bool) (int, error) {
 	fcLog := filepath.Join(rec.LogDir, "firecracker.log")
-	// FC requires the log file to exist before startup (opens O_WRONLY|O_APPEND, no O_CREATE).
+	// FC opens log O_WRONLY|O_APPEND without O_CREATE — touch first.
 	if f, createErr := os.Create(fcLog); createErr == nil { //nolint:gosec
 		_ = f.Close()
 	}
diff --git a/hypervisor/snapshot.go b/hypervisor/snapshot.go
index ae791fe..4b08c9f 100644
--- a/hypervisor/snapshot.go
+++ b/hypervisor/snapshot.go
@@ -11,16 +11,10 @@ import (
 	"github.com/cocoonstack/cocoon/utils"
 )
 
-// SnapshotMetaFile is the cocoon-owned sidecar persisted alongside the
-// hypervisor's native snapshot files. It carries data the hypervisor's
-// own config schema cannot hold (Role/MountPoint/FSType/DirectIO on disks)
-// and FC-specific resource numbers (CPU/Memory) that are not in the
-// vmstate binary.
+// SnapshotMetaFile is the cocoon-owned sidecar carrying fields the hypervisor's
+// native config can't hold (Role/MountPoint/FSType/DirectIO; FC CPU/Memory).
 const SnapshotMetaFile = "cocoon.json"
 
-// SnapshotMeta is the on-disk shape of the cocoon sidecar. CH leaves
-// CPU/Memory zero (omitempty drops them); FC fills them in because FC
-// snapshot/load cannot resize the guest after restore.
 type SnapshotMeta struct {
 	StorageConfigs []*types.StorageConfig `json:"storage_configs"`
 	BootConfig     *types.BootConfig      `json:"boot_config,omitempty"`
@@ -28,15 +22,10 @@ type SnapshotMeta struct {
 	Memory int64 `json:"memory,omitempty"`
 }
 
-// SaveSnapshotMeta atomically writes meta to dir/SnapshotMetaFile. Atomic
-// rename matters: clone/restore later read the sidecar after this write,
-// and a partial file from a crashed cocoon would surface as a JSON decode
-// error blocking those flows.
 func SaveSnapshotMeta(dir string, meta *SnapshotMeta) error {
 	return utils.AtomicWriteJSON(filepath.Join(dir, SnapshotMetaFile), meta)
 }
 
-// LoadSnapshotMeta reads and decodes dir/SnapshotMetaFile.
 func LoadSnapshotMeta(dir string) (*SnapshotMeta, error) {
 	data, err := os.ReadFile(filepath.Join(dir, SnapshotMetaFile)) //nolint:gosec
 	if err != nil {
@@ -49,10 +38,6 @@ func LoadSnapshotMeta(dir string) (*SnapshotMeta, error) {
 	return &meta, nil
 }
 
-// LoadAndValidateMeta loads the sidecar from dir and rejects paths that
-// escape rootDir or runDir.
Both backends use this entry point so an -// imported snapshot's cocoon.json cannot smuggle paths outside cocoon-managed -// roots before any subsequent open/rename operation runs. func LoadAndValidateMeta(dir, rootDir, runDir string) (*SnapshotMeta, error) { meta, err := LoadSnapshotMeta(dir) if err != nil { @@ -64,9 +49,6 @@ func LoadAndValidateMeta(dir, rootDir, runDir string) (*SnapshotMeta, error) { return meta, nil } -// CloneStorageConfigs returns a deep copy of storageConfigs suitable for -// embedding in an on-disk sidecar — subsequent mutations on the live VM -// record won't taint the persisted JSON. func CloneStorageConfigs(storageConfigs []*types.StorageConfig) []*types.StorageConfig { out := make([]*types.StorageConfig, 0, len(storageConfigs)) for _, sc := range storageConfigs { @@ -76,10 +58,8 @@ func CloneStorageConfigs(storageConfigs []*types.StorageConfig) []*types.Storage return out } -// IsUnderDir reports whether path resolves to a location strictly under dir. -// Used to reject snapshot-imported paths that escape cocoon-managed roots. -// An empty dir always returns false to disable the check rather than match -// every path. +// IsUnderDir reports whether path is strictly under dir. An empty dir returns +// false (disables the check) rather than matching every path. func IsUnderDir(path, dir string) bool { if dir == "" { return false @@ -89,9 +69,8 @@ func IsUnderDir(path, dir string) bool { return strings.HasPrefix(cleaned, root+string(filepath.Separator)) } -// ValidateMetaPaths rejects sidecar paths that escape cocoon-managed roots. -// Snapshots may be imported across hosts with arbitrary cocoon.json content, -// so every path is checked before subsequent code opens the file. +// ValidateMetaPaths rejects sidecar paths escaping cocoon-managed roots; an +// imported snapshot's cocoon.json is otherwise untrusted. func ValidateMetaPaths(meta *SnapshotMeta, rootDir, runDir string) error { for _, sc := range meta.StorageConfigs { if !IsUnderDir(sc.Path, rootDir) && !IsUnderDir(sc.Path, runDir) { @@ -109,9 +88,8 @@ func ValidateMetaPaths(meta *SnapshotMeta, rootDir, runDir string) error { return nil } -// ReverseLayers walks storageConfigs once, collecting Role==Layer entries in -// reverse order via project. The reversed result mirrors overlayfs lowerdir -// semantics where the topmost layer comes first. +// ReverseLayers projects Role==Layer entries through fn in reverse order +// (topmost layer first, matching overlayfs lowerdir semantics). func ReverseLayers[T any](storageConfigs []*types.StorageConfig, project func(idx int, sc *types.StorageConfig) T) []T { var layers []*types.StorageConfig for _, sc := range storageConfigs { diff --git a/hypervisor/snapshot_meta_test.go b/hypervisor/snapshot_test.go similarity index 100% rename from hypervisor/snapshot_meta_test.go rename to hypervisor/snapshot_test.go diff --git a/utils/map.go b/utils/map.go index 19090a7..b859303 100644 --- a/utils/map.go +++ b/utils/map.go @@ -5,11 +5,8 @@ import ( "maps" ) -// LookupCopy returns a shallow copy of the value at key in m. -// Returns an error if the key is absent or the stored pointer is nil. -// NOTE: this is a shallow copy — pointer, slice, and map fields inside T -// still reference the original data. Callers must not mutate such fields -// on the returned value without additional deep-copy logic. +// LookupCopy returns a shallow copy at key. Pointer/slice/map fields inside T +// still alias the original — callers must not mutate them without deep-copying. 
func LookupCopy[T any](m map[string]*T, key string) (T, error) { v := m[key] if v == nil { @@ -33,7 +30,6 @@ func MergeSets[K comparable](sets ...map[K]struct{}) map[K]struct{} { } // MapValues projects every non-nil value in m through fn into a slice. -// Used by List endpoints to materialize record maps without hand-rolled loops. func MapValues[K comparable, V, R any](m map[K]*V, fn func(*V) R) []R { if len(m) == 0 { return nil diff --git a/utils/stream.go b/utils/stream.go index 12eab20..acb9fc3 100644 --- a/utils/stream.go +++ b/utils/stream.go @@ -69,10 +69,9 @@ func TarDirStreamWithRemove(dir string) io.ReadCloser { }) } -// PeekReader reads up to n bytes from the head of r and returns those bytes -// alongside a reader that re-emits them followed by the rest of r. A short -// read at EOF is not an error — the caller checks len(head). Used to sniff -// magic bytes (gzip, qcow2) without forfeiting the stream. +// PeekReader peeks up to n bytes and returns them along with a reader that +// re-emits the head followed by the rest of r. A short read at EOF is not an +// error — caller checks len(head). func PeekReader(r io.Reader, n int) ([]byte, io.Reader, error) { head := make([]byte, n) actual, err := io.ReadFull(r, head) From ce1599eca1254ab2f85bf802680a355364910837 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 13:33:16 +0800 Subject: [PATCH 21/22] refactor(hypervisor): reorder backend.go layout per SKILL.md Consolidate types at top, group all Backend public methods, then private, then standalone funcs. Eliminates the 15 layout violations introduced when LaunchVMProcess / RestoreSequence / DirectRestoreSequence were appended at end of file in the recent template refactor. No behavior change. --- hypervisor/backend.go | 191 ++++++++++++++++++++---------------------- 1 file changed, 90 insertions(+), 101 deletions(-) diff --git a/hypervisor/backend.go b/hypervisor/backend.go index 2d84bef..0d4baf7 100644 --- a/hypervisor/backend.go +++ b/hypervisor/backend.go @@ -20,7 +20,6 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -// Shared constants for all hypervisor backends. const ( APISocketName = "api.sock" ConsoleSockName = "console.sock" @@ -34,15 +33,13 @@ const ( // VMMemTransferTimeout is the single-shot timeout for snapshot/restore API calls. VMMemTransferTimeout = 10 * time.Minute - // MinBalloonMemory is the minimum guest memory (256 MiB) below which - // balloon is not enabled — the overhead is not worthwhile for tiny VMs. + // MinBalloonMemory: balloon overhead is not worthwhile below 256 MiB guest memory. MinBalloonMemory = 256 << 20 // DefaultBalloonDiv sizes the initial balloon as memory/DefaultBalloonDiv (25%). DefaultBalloonDiv = 4 - // GracefulStopPollInterval is how often we check if the guest has powered off - // after sending a graceful shutdown signal (ACPI power-button or SendCtrlAltDel). + // GracefulStopPollInterval polls between graceful shutdown signal and timeout escalation. GracefulStopPollInterval = 500 * time.Millisecond ) @@ -68,6 +65,40 @@ type Backend struct { Locker lock.Locker } +// LaunchSpec is the per-call input to Backend.LaunchVMProcess. Shared +// BinaryName / SocketWaitTimeout come from BackendConfig. +type LaunchSpec struct { + Cmd *exec.Cmd + PIDPath string + SockPath string + NetnsPath string // empty = host netns + OnFail func() // optional cleanup on any error path +} + +// RestoreSpec carries the backend-specific hooks for Backend.RestoreSequence. 
+type RestoreSpec struct { + VMCfg *types.VMConfig + Snapshot io.Reader + OverrideCheck func(rec *VMRecord, vmCfg *types.VMConfig) error + Preflight func(stagingDir string, rec *VMRecord) error + Kill func(ctx context.Context, vmID string, rec *VMRecord) error + Wrap func(rec *VMRecord, fn func() error) error // optional disk lock around merge+afterExtract + BeforeMerge func(rec *VMRecord) error // e.g. FC removes stale COW + AfterExtract func(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord) (*types.VM, error) +} + +// DirectRestoreSpec is RestoreSpec for a local srcDir rather than a tar; Populate replaces staging+merge. +type DirectRestoreSpec struct { + VMCfg *types.VMConfig + SrcDir string + OverrideCheck func(rec *VMRecord, vmCfg *types.VMConfig) error + Preflight func(srcDir string, rec *VMRecord) error + Kill func(ctx context.Context, vmID string, rec *VMRecord) error + Wrap func(rec *VMRecord, fn func() error) error + Populate func(rec *VMRecord, srcDir string) error + AfterExtract func(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord) (*types.VM, error) +} + func (b *Backend) Type() string { return b.Typ } // Inspect returns VM info for a single VM by ref (ID, name, or prefix). @@ -230,9 +261,8 @@ func (b *Backend) ReserveVM(ctx context.Context, id string, vmCfg *types.VMConfi }) } -// CloneSetup handles the shared pre-clone sequence used by both -// backends' Clone and DirectClone entry points: validate CPU, -// reserve a placeholder record, create dirs, and return a cleanup function. +// CloneSetup is the shared pre-clone sequence: validate CPU, reserve a +// placeholder, ensure dirs, return a cleanup that rolls back both. func (b *Backend) CloneSetup(ctx context.Context, vmID string, vmCfg *types.VMConfig, snapshotConfig *types.SnapshotConfig) (runDir, logDir string, now time.Time, cleanup func(), err error) { if err = ValidateHostCPU(vmCfg.CPU); err != nil { return "", "", time.Time{}, nil, err @@ -278,9 +308,8 @@ func (b *Backend) ForEachVM(ctx context.Context, ids []string, op string, fn fun return result.Succeeded, result.Err() } -// KillForRestore stops a running VM and cleans up its runtime files in -// preparation for a restore. The terminate callback performs the -// backend-specific shutdown (e.g. CH graceful shutdown vs FC direct kill). +// KillForRestore stops the running VM via the backend-specific terminate hook +// and clears runtime files. func (b *Backend) KillForRestore(ctx context.Context, vmID string, rec *VMRecord, terminate func(pid int) error, runtimeFiles []string) error { killErr := b.WithRunningVM(ctx, rec, terminate) if killErr != nil && !errors.Is(killErr, ErrNotRunning) { @@ -291,9 +320,8 @@ func (b *Backend) KillForRestore(ctx context.Context, vmID string, rec *VMRecord return nil } -// DirectCloneBase runs the shared DirectClone sequence: reserve a -// placeholder via CloneSetup, copy snapshot files via cloneFiles, then -// hand off to afterExtract for backend-specific startup. +// DirectCloneBase: reserve placeholder via CloneSetup, copy snapshot files, +// then hand off to afterExtract. 
func (b *Backend) DirectCloneBase( ctx context.Context, vmID string, vmCfg *types.VMConfig, networkConfigs []*types.NetworkConfig, snapshotConfig *types.SnapshotConfig, srcDir string, @@ -317,9 +345,8 @@ func (b *Backend) DirectCloneBase( return afterExtract(ctx, vmID, vmCfg, networkConfigs, runDir, logDir, now) } -// CloneFromStream runs the shared streaming Clone sequence: reserve a -// placeholder via CloneSetup, extract the snapshot tar into runDir, then -// hand off to afterExtract for backend-specific startup. +// CloneFromStream: reserve placeholder via CloneSetup, extract tar into runDir, +// then hand off to afterExtract. func (b *Backend) CloneFromStream( ctx context.Context, vmID string, vmCfg *types.VMConfig, networkConfigs []*types.NetworkConfig, snapshotConfig *types.SnapshotConfig, snapshot io.Reader, @@ -342,7 +369,7 @@ func (b *Backend) CloneFromStream( return afterExtract(ctx, vmID, vmCfg, networkConfigs, runDir, logDir, now) } -// ResolveForRestore resolves VM ref and validates it's running. +// ResolveForRestore resolves vmRef and validates the VM is running. func (b *Backend) ResolveForRestore(ctx context.Context, vmRef string) (string, *VMRecord, error) { vmID, err := b.ResolveRef(ctx, vmRef) if err != nil { @@ -358,9 +385,7 @@ func (b *Backend) ResolveForRestore(ctx context.Context, vmRef string) (string, return vmID, &rec, nil } -// GracefulStop sends a shutdown signal, polls until the process exits, -// and escalates via the escalate closure if the timeout fires. -// Used by both CH (ACPI power-button) and FC (SendCtrlAltDel). +// GracefulStop signals shutdown, polls until exit, escalates on timeout. func (b *Backend) GracefulStop(ctx context.Context, vmID string, pid int, timeout time.Duration, signal, escalate func() error) error { logger := log.WithFunc(b.Typ + ".GracefulStop") if err := signal(); err != nil { @@ -380,14 +405,13 @@ func (b *Backend) GracefulStop(ctx context.Context, vmID string, pid int, timeou return escalate() } -// AbortLaunch terminates a failed launch and removes runtime files. +// AbortLaunch terminates a failed launch and clears runtime files. func (b *Backend) AbortLaunch(ctx context.Context, pid int, sockPath, runDir string, runtimeFiles []string) { _ = utils.TerminateProcess(ctx, pid, b.Conf.BinaryName(), sockPath, b.Conf.TerminateGracePeriod()) CleanupRuntimeFiles(ctx, runDir, runtimeFiles) } -// StartAll is the shared Start template: resolve refs, run startOne per ID, -// flip succeeded records to Running. Both backends call it as a one-liner. +// StartAll: resolve refs, run startOne per ID, flip succeeded to Running. func (b *Backend) StartAll(ctx context.Context, refs []string, startOne func(context.Context, string) error) ([]string, error) { ids, err := b.ResolveRefs(ctx, refs) if err != nil { @@ -400,8 +424,7 @@ func (b *Backend) StartAll(ctx context.Context, refs []string, startOne func(con return succeeded, forEachErr } -// StopAll is the shared Stop template: resolve refs, run stopOne per ID, -// flip succeeded records to Stopped. +// StopAll: resolve refs, run stopOne per ID, flip succeeded to Stopped. 
func (b *Backend) StopAll(ctx context.Context, refs []string, stopOne func(context.Context, string) error) ([]string, error) { ids, err := b.ResolveRefs(ctx, refs) if err != nil { @@ -414,10 +437,9 @@ func (b *Backend) StopAll(ctx context.Context, refs []string, stopOne func(conte return succeeded, forEachErr } -// DeleteAll is the shared Delete template: gracefully stop a running VM when -// force=true via the supplied stopOne, remove the VM dirs, then delete the -// index record. The dir-cleanup intentionally precedes the DB delete so a -// failed cleanup leaves the record intact and the user can retry rm. +// DeleteAll: optionally stop a running VM (force), remove dirs, then delete +// the index record. Dir cleanup precedes DB delete so a failed cleanup leaves +// the record intact for retry. func (b *Backend) DeleteAll(ctx context.Context, refs []string, force bool, stopOne func(context.Context, string) error) ([]string, error) { ids, err := b.ResolveRefs(ctx, refs) if err != nil { @@ -488,8 +510,7 @@ func (b *Backend) CleanStalePlaceholders(_ context.Context, ids []string) error }) } -// GCCollect removes orphan VM directories and stale DB records. -// Kills leftover hypervisor processes before removing directories. +// GCCollect kills leftover hypervisor processes and removes orphan dirs/records. // Runs under the GC orchestrator's flock — uses lock-free DB access. func (b *Backend) GCCollect(ctx context.Context, ids []string) error { var errs []error @@ -551,7 +572,7 @@ func (b *Backend) BuildSnapshotConfig(snapID string, rec *VMRecord) *types.Snaps return cfg } -// FinalizeRestore updates DB and assembles returned VM after restore. +// FinalizeRestore updates DB and assembles the returned VM after restore. func (b *Backend) FinalizeRestore(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord, pid int) (*types.VM, error) { now := time.Now() if err := b.DB.Update(ctx, func(idx *VMIndex) error { @@ -578,7 +599,7 @@ func (b *Backend) FinalizeRestore(ctx context.Context, vmID string, vmCfg *types return &info, nil } -// PrepareStart loads record, validates state, ensures dirs are ready. +// PrepareStart loads the record, verifies not-running, ensures dirs exist. func (b *Backend) PrepareStart(ctx context.Context, id string, runtimeFiles []string) (*VMRecord, error) { rec, err := b.LoadRecord(ctx, id) if err != nil { @@ -602,7 +623,7 @@ func (b *Backend) PrepareStart(ctx context.Context, id string, runtimeFiles []st return &rec, nil } -// FinalizeCreate writes populated VM record to DB, replacing placeholder. +// FinalizeCreate writes a populated VM record to DB, replacing the placeholder. func (b *Backend) FinalizeCreate(ctx context.Context, id string, info *types.VM, bootCfg *types.BootConfig, blobIDs map[string]struct{}) error { return b.DB.Update(ctx, func(idx *VMIndex) error { existing := idx.VMs[id] @@ -645,32 +666,6 @@ func (b *Backend) HandleStopResult(ctx context.Context, id, runDir string, runti return nil } -// killOrphanProcess terminates a leftover hypervisor process if the PID -// file exists and the process matches the expected binary name. 
-func (b *Backend) killOrphanProcess(ctx context.Context, runDir string) { - pid, err := utils.ReadPIDFile(b.PIDFilePath(runDir)) - if err != nil { - return - } - sockPath := SocketPath(runDir) - if !utils.VerifyProcessCmdline(pid, b.Conf.BinaryName(), sockPath) { - return - } - _ = utils.TerminateProcess(ctx, pid, b.Conf.BinaryName(), sockPath, b.Conf.TerminateGracePeriod()) -} - -func SocketPath(runDir string) string { return filepath.Join(runDir, APISocketName) } - -func ConsoleSockPath(runDir string) string { return filepath.Join(runDir, ConsoleSockName) } - -type LaunchSpec struct { - Cmd *exec.Cmd - PIDPath string - SockPath string - NetnsPath string // empty = host netns - OnFail func() // optional cleanup on any error path -} - // LaunchVMProcess starts spec.Cmd and waits for the API socket. On any error // after Start, the process is killed and the PID file is removed. Caller // reaps cmd via cmd.Wait() in a goroutine on success. @@ -719,46 +714,6 @@ func (b *Backend) LaunchVMProcess(ctx context.Context, spec LaunchSpec) (pid int return pid, nil } -// PrepareStagingDir extracts the snapshot tar into a sibling staging dir. -func PrepareStagingDir(runDir string, snapshot io.Reader) (stagingDir string, cleanup func(), err error) { - stagingDir = runDir + ".restore-staging" - if err = os.RemoveAll(stagingDir); err != nil { - return "", nil, fmt.Errorf("clear staging dir: %w", err) - } - if err = os.MkdirAll(stagingDir, 0o700); err != nil { - return "", nil, fmt.Errorf("create staging dir: %w", err) - } - cleanup = func() { os.RemoveAll(stagingDir) } //nolint:errcheck,gosec - if err = utils.ExtractTar(stagingDir, snapshot); err != nil { - cleanup() - return "", nil, fmt.Errorf("extract snapshot: %w", err) - } - return stagingDir, cleanup, nil -} - -type RestoreSpec struct { - VMCfg *types.VMConfig - Snapshot io.Reader - OverrideCheck func(rec *VMRecord, vmCfg *types.VMConfig) error - Preflight func(stagingDir string, rec *VMRecord) error - Kill func(ctx context.Context, vmID string, rec *VMRecord) error - Wrap func(rec *VMRecord, fn func() error) error // optional disk lock around merge+afterExtract - BeforeMerge func(rec *VMRecord) error // e.g. FC removes stale COW - AfterExtract func(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord) (*types.VM, error) -} - -// DirectRestoreSpec is RestoreSpec for a local srcDir rather than a tar; Populate replaces staging+merge. -type DirectRestoreSpec struct { - VMCfg *types.VMConfig - SrcDir string - OverrideCheck func(rec *VMRecord, vmCfg *types.VMConfig) error - Preflight func(srcDir string, rec *VMRecord) error - Kill func(ctx context.Context, vmID string, rec *VMRecord) error - Wrap func(rec *VMRecord, fn func() error) error - Populate func(rec *VMRecord, srcDir string) error - AfterExtract func(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord) (*types.VM, error) -} - // RestoreSequence: validate CPU → resolve → stage → preflight → kill → merge → AfterExtract. func (b *Backend) RestoreSequence(ctx context.Context, vmRef string, spec RestoreSpec) (*types.VM, error) { if err := ValidateHostCPU(spec.VMCfg.CPU); err != nil { @@ -853,3 +808,37 @@ func (b *Backend) DirectRestoreSequence(ctx context.Context, vmRef string, spec } return result, nil } + +// killOrphanProcess terminates a leftover hypervisor process if PID matches the binary. 
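+// It reads the PID file and verifies the process cmdline before terminating.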
+func (b *Backend) killOrphanProcess(ctx context.Context, runDir string) { + pid, err := utils.ReadPIDFile(b.PIDFilePath(runDir)) + if err != nil { + return + } + sockPath := SocketPath(runDir) + if !utils.VerifyProcessCmdline(pid, b.Conf.BinaryName(), sockPath) { + return + } + _ = utils.TerminateProcess(ctx, pid, b.Conf.BinaryName(), sockPath, b.Conf.TerminateGracePeriod()) +} + +func SocketPath(runDir string) string { return filepath.Join(runDir, APISocketName) } + +func ConsoleSockPath(runDir string) string { return filepath.Join(runDir, ConsoleSockName) } + +// PrepareStagingDir extracts the snapshot tar into a sibling staging dir. +func PrepareStagingDir(runDir string, snapshot io.Reader) (stagingDir string, cleanup func(), err error) { + stagingDir = runDir + ".restore-staging" + if err = os.RemoveAll(stagingDir); err != nil { + return "", nil, fmt.Errorf("clear staging dir: %w", err) + } + if err = os.MkdirAll(stagingDir, 0o700); err != nil { + return "", nil, fmt.Errorf("create staging dir: %w", err) + } + cleanup = func() { os.RemoveAll(stagingDir) } //nolint:errcheck,gosec + if err = utils.ExtractTar(stagingDir, snapshot); err != nil { + cleanup() + return "", nil, fmt.Errorf("extract snapshot: %w", err) + } + return stagingDir, cleanup, nil +} From ee75245dd3c60fc477d71589b31cca56443bbaad Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 28 Apr 2026 13:59:10 +0800 Subject: [PATCH 22/22] =?UTF-8?q?refactor:=20final=20/code=20+=20/simplify?= =?UTF-8?q?=20pass=20=E2=80=94=20close=20audit=20loop?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aggregate findings from 5 parallel audit agents (ordering/types/naming, errors+modernization, reuse, quality, efficiency) across 156 non-test files. All actionable items applied; net -3 LOC. Reuse / templates: - hypervisor.PreflightRestore template: load+validate sidecar, integrity callback, ValidateRoleSequence. CH/FC preflightRestore become 3-line delegations. - cmd/core.resolveOwner generic: ResolveImageOwner and resolveVMOwner collapse onto one helper parameterized by found-callback + sentinels. - firecracker.putJSON generic: 5 putXxx helpers collapse to thin wrappers. Efficiency: - snapshot/localfile.Delete batches the per-id store.Update into a single transaction (was N+1 flock cycles on M-ref delete). - cmd/vm.recoverNetwork uses one List + map lookup instead of M Inspect calls under DB lock. - network/cni: drop redundant LinkByName re-fetch when overrideMAC is set (already known value). Style / dead code: - Remove dead vmAPI / vmAPICall (unused after the *APIOnce migration). - Remove duplicate TODO(inspect) in cloudhypervisor/extend.go. - Drop firecracker PR-number reference from clone.go (per comment hygiene). - Restore exec-justification comment in cloudhypervisor/start.go. - utils/tar: errors.Is(err, io.EOF) instead of == comparison. - strconv.Itoa / FormatInt instead of fmt.Sprintf("%d", ...). - cloudhypervisor/snapshot: cowName via filepath.Base(cowPath) instead of branched literal. - cmd/vm: vmID[:8] → network.VMIDPrefix. Layout: - network/bridge/bridge_other.go: var before type. - images/cloudimg/inspect.go: var before type. make lint dual-platform 0 issues; make test all pass. 
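A minimal sketch of the localfile batching shape, with invented index/store
stand-ins for the real snapshot/localfile types (all names here are
illustrative only): deleting M refs now costs one flock/Update cycle
instead of M.

package localfile

import "context"

// index and store are invented stand-ins; the real localfile types differ.
type index struct{ Snapshots map[string]struct{} }

type store struct{ idx index }

// Update runs fn against the decoded index; the real version brackets fn
// with one flock acquire and one atomic JSON rewrite (elided in this sketch).
func (s *store) Update(ctx context.Context, fn func(*index) error) error {
	return fn(&s.idx)
}

// deleteAll folds every ref into a single Update, replacing the previous
// one-Update-per-id loop (N+1 lock cycles on an M-ref delete).
func deleteAll(ctx context.Context, s *store, ids []string) error {
	return s.Update(ctx, func(idx *index) error {
		for _, id := range ids {
			delete(idx.Snapshots, id)
		}
		return nil
	})
}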
---
 cmd/core/helpers.go                    | 59 ++++++++++++------------
 cmd/vm/attach.go                       | 62 +++++++++++---------------
 cmd/vm/lifecycle.go                    | 18 +++++++-
 cmd/vm/run.go                          |  2 +-
 hypervisor/cloudhypervisor/extend.go   | 13 +++---
 hypervisor/cloudhypervisor/helper.go   | 22 +++------
 hypervisor/cloudhypervisor/restore.go  | 11 +----
 hypervisor/cloudhypervisor/snapshot.go |  5 +--
 hypervisor/cloudhypervisor/start.go    |  1 +
 hypervisor/cloudhypervisor/utils.go    |  3 +-
 hypervisor/firecracker/api.go          | 39 ++++++----------
 hypervisor/firecracker/clone.go        |  4 +-
 hypervisor/firecracker/helper.go       | 19 ++++----
 hypervisor/firecracker/start.go        |  3 +-
 hypervisor/snapshot.go                 | 14 ++++++
 images/cloudimg/inspect.go             | 13 +++---
 network/bridge/bridge_other.go         |  4 +-
 network/cni/create_linux.go            |  7 +--
 snapshot/localfile/localfile.go        | 24 +++++-----
 utils/tar.go                           |  5 ++-
 20 files changed, 161 insertions(+), 167 deletions(-)

diff --git a/cmd/core/helpers.go b/cmd/core/helpers.go
index 6149785..07cc11e 100644
--- a/cmd/core/helpers.go
+++ b/cmd/core/helpers.go
@@ -270,19 +270,33 @@ func EnsureImage(ctx context.Context, backends []imagebackend.Images, vmCfg *typ
 }
 
 func ResolveImageOwner(ctx context.Context, backends []imagebackend.Images, ref string) (imagebackend.Images, error) {
-	var matches []imagebackend.Images
-	for _, b := range backends {
+	return resolveOwner(backends, ref, func(b imagebackend.Images) (bool, error) {
 		img, err := b.Inspect(ctx, ref)
+		return img != nil, err
+	},
+		fmt.Errorf("image %q: not found in any backend", ref),
+		fmt.Errorf("image %s: %w", ref, imagebackend.ErrAmbiguous),
+	)
+}
+
+// resolveOwner returns the unique backend among backends for which found
+// reports true. notFound is returned when zero backends match; ambiguous
+// wraps the caller-supplied error and lists the matched backend types.
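+// T needs only Type() string, so image backends and hypervisors both fit.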
+func resolveOwner[T interface{ Type() string }](backends []T, ref string, found func(T) (bool, error), notFound, ambiguous error) (T, error) { + var matches []T + var zero T + for _, b := range backends { + ok, err := found(b) if err != nil { - return nil, fmt.Errorf("inspect %s in %s: %w", ref, b.Type(), err) + return zero, fmt.Errorf("inspect %s in %s: %w", ref, b.Type(), err) } - if img != nil { + if ok { matches = append(matches, b) } } switch len(matches) { case 0: - return nil, fmt.Errorf("image %q: not found in any backend", ref) + return zero, notFound case 1: return matches[0], nil default: @@ -290,7 +304,7 @@ func ResolveImageOwner(ctx context.Context, backends []imagebackend.Images, ref for i, b := range matches { names[i] = b.Type() } - return nil, fmt.Errorf("image %s: %w (backends: %s)", ref, imagebackend.ErrAmbiguous, strings.Join(names, ", ")) + return zero, fmt.Errorf("%w (backends: %s)", ambiguous, strings.Join(names, ", ")) } } @@ -503,30 +517,17 @@ func findHypervisorFactory(typ config.HypervisorType) func(*config.Config) (hype } func resolveVMOwner(ctx context.Context, hypers []hypervisor.Hypervisor, ref string) (hypervisor.Hypervisor, error) { - var matches []hypervisor.Hypervisor - for _, h := range hypers { - _, resolveErr := h.Inspect(ctx, ref) - if resolveErr == nil { - matches = append(matches, h) - continue + return resolveOwner(hypers, ref, func(h hypervisor.Hypervisor) (bool, error) { + if _, err := h.Inspect(ctx, ref); err == nil { + return true, nil + } else if !errors.Is(err, hypervisor.ErrNotFound) { + return false, err } - if errors.Is(resolveErr, hypervisor.ErrNotFound) { - continue - } - return nil, fmt.Errorf("inspect %s in %s: %w", ref, h.Type(), resolveErr) - } - switch len(matches) { - case 0: - return nil, fmt.Errorf("vm %s: %w", ref, hypervisor.ErrNotFound) - case 1: - return matches[0], nil - default: - names := make([]string, len(matches)) - for i, h := range matches { - names[i] = h.Type() - } - return nil, fmt.Errorf("vm %s: %w (backends: %s)", ref, hypervisor.ErrAmbiguous, strings.Join(names, ", ")) - } + return false, nil + }, + fmt.Errorf("vm %s: %w", ref, hypervisor.ErrNotFound), + fmt.Errorf("vm %s: %w", ref, hypervisor.ErrAmbiguous), + ) } // sanitizeVMName derives a safe VM name from an image reference. 
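For the shape of a call site, a hypothetical third resolver: snapstore and
all of its names are invented for illustration, assumed to expose
Type() string (required by resolveOwner's constraint), Has(ctx, ref)
returning (bool, error), and an ErrAmbiguous sentinel. The caller supplies
only the probe and its two sentinel errors:

func resolveSnapshotOwner(ctx context.Context, stores []snapstore.Store, ref string) (snapstore.Store, error) {
	return resolveOwner(stores, ref, func(s snapstore.Store) (bool, error) {
		return s.Has(ctx, ref) // existence probe, mirroring the Inspect calls above
	},
		fmt.Errorf("snapshot %q: not found in any store", ref),
		fmt.Errorf("snapshot %s: %w", ref, snapstore.ErrAmbiguous),
	)
}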
diff --git a/cmd/vm/attach.go b/cmd/vm/attach.go index 2ba3f92..4279e42 100644 --- a/cmd/vm/attach.go +++ b/cmd/vm/attach.go @@ -1,6 +1,7 @@ package vm import ( + "context" "errors" "fmt" @@ -14,18 +15,10 @@ import ( ) func (h Handler) FsAttach(cmd *cobra.Command, args []string) error { - ctx, conf, err := h.Init(cmd) + ctx, a, err := resolveAttacher[fs.Attacher](h, cmd, args, "fs attach", fs.ErrUnsupportedBackend) if err != nil { return err } - hyper, err := cmdcore.FindHypervisor(ctx, conf, args[0]) - if err != nil { - return fmt.Errorf("fs attach: %w", err) - } - a, ok := hyper.(fs.Attacher) - if !ok { - return fmt.Errorf("fs attach: backend %s: %w", hyper.Type(), fs.ErrUnsupportedBackend) - } socket, _ := cmd.Flags().GetString("socket") tag, _ := cmd.Flags().GetString("tag") numQ, _ := cmd.Flags().GetInt("num-queues") @@ -42,18 +35,10 @@ func (h Handler) FsAttach(cmd *cobra.Command, args []string) error { } func (h Handler) FsDetach(cmd *cobra.Command, args []string) error { - ctx, conf, err := h.Init(cmd) + ctx, a, err := resolveAttacher[fs.Attacher](h, cmd, args, "fs detach", fs.ErrUnsupportedBackend) if err != nil { return err } - hyper, err := cmdcore.FindHypervisor(ctx, conf, args[0]) - if err != nil { - return fmt.Errorf("fs detach: %w", err) - } - a, ok := hyper.(fs.Attacher) - if !ok { - return fmt.Errorf("fs detach: backend %s: %w", hyper.Type(), fs.ErrUnsupportedBackend) - } tag, _ := cmd.Flags().GetString("tag") if err := a.FsDetach(ctx, args[0], tag); err != nil { return classifyAttachErr(err) @@ -66,18 +51,10 @@ func (h Handler) FsDetach(cmd *cobra.Command, args []string) error { } func (h Handler) DeviceAttach(cmd *cobra.Command, args []string) error { - ctx, conf, err := h.Init(cmd) + ctx, a, err := resolveAttacher[vfio.Attacher](h, cmd, args, "device attach", vfio.ErrUnsupportedBackend) if err != nil { return err } - hyper, err := cmdcore.FindHypervisor(ctx, conf, args[0]) - if err != nil { - return fmt.Errorf("device attach: %w", err) - } - a, ok := hyper.(vfio.Attacher) - if !ok { - return fmt.Errorf("device attach: backend %s: %w", hyper.Type(), vfio.ErrUnsupportedBackend) - } pci, _ := cmd.Flags().GetString("pci") id, _ := cmd.Flags().GetString("id") deviceID, err := a.DeviceAttach(ctx, args[0], vfio.Spec{PCI: pci, ID: id}) @@ -92,18 +69,10 @@ func (h Handler) DeviceAttach(cmd *cobra.Command, args []string) error { } func (h Handler) DeviceDetach(cmd *cobra.Command, args []string) error { - ctx, conf, err := h.Init(cmd) + ctx, a, err := resolveAttacher[vfio.Attacher](h, cmd, args, "device detach", vfio.ErrUnsupportedBackend) if err != nil { return err } - hyper, err := cmdcore.FindHypervisor(ctx, conf, args[0]) - if err != nil { - return fmt.Errorf("device detach: %w", err) - } - a, ok := hyper.(vfio.Attacher) - if !ok { - return fmt.Errorf("device detach: backend %s: %w", hyper.Type(), vfio.ErrUnsupportedBackend) - } id, _ := cmd.Flags().GetString("id") if err := a.DeviceDetach(ctx, args[0], id); err != nil { return classifyAttachErr(err) @@ -115,6 +84,27 @@ func (h Handler) DeviceDetach(cmd *cobra.Command, args []string) error { return nil } +// resolveAttacher resolves args[0] to a hypervisor and asserts it implements +// A (fs.Attacher / vfio.Attacher). The op string ("fs attach", "device detach") +// prefixes both error wraps so the four handlers no longer repeat the +// Init→FindHypervisor→type-assert boilerplate. 
+func resolveAttacher[A any](h Handler, cmd *cobra.Command, args []string, op string, errUnsupported error) (context.Context, A, error) { + var zero A + ctx, conf, err := h.Init(cmd) + if err != nil { + return ctx, zero, err + } + hyper, err := cmdcore.FindHypervisor(ctx, conf, args[0]) + if err != nil { + return ctx, zero, fmt.Errorf("%s: %w", op, err) + } + a, ok := hyper.(A) + if !ok { + return ctx, zero, fmt.Errorf("%s: backend %s: %w", op, hyper.Type(), errUnsupported) + } + return ctx, a, nil +} + // classifyAttachErr surfaces ErrNotRunning more clearly than the generic wrap. func classifyAttachErr(err error) error { if errors.Is(err, hypervisor.ErrNotRunning) { diff --git a/cmd/vm/lifecycle.go b/cmd/vm/lifecycle.go index 21e3a42..ad97409 100644 --- a/cmd/vm/lifecycle.go +++ b/cmd/vm/lifecycle.go @@ -238,9 +238,23 @@ func (h Handler) recoverNetwork(ctx context.Context, conf *config.Config, hyper // Cache bridge providers by device name to avoid redundant netlink lookups. bridgeProviders := map[string]network.Network{} + // Single List → map by ID/name avoids one Inspect-per-ref under DB lock. + all, err := hyper.List(ctx) + if err != nil { + logger.Warnf(ctx, "list VMs for recovery: %v", err) + return + } + byRef := make(map[string]*types.VM, len(all)*2) + for _, vm := range all { + byRef[vm.ID] = vm + if vm.Config.Name != "" { + byRef[vm.Config.Name] = vm + } + } + for _, ref := range refs { - vm, err := hyper.Inspect(ctx, ref) - if err != nil || vm == nil || len(vm.NetworkConfigs) == 0 { + vm := byRef[ref] + if vm == nil || len(vm.NetworkConfigs) == 0 { continue } diff --git a/cmd/vm/run.go b/cmd/vm/run.go index e72f3e4..a8a52ea 100644 --- a/cmd/vm/run.go +++ b/cmd/vm/run.go @@ -246,7 +246,7 @@ func (h Handler) prepareClone(ctx context.Context, cmd *cobra.Command, conf *con } vmID := utils.GenerateID() if vmCfg.Name == "" { - vmCfg.Name = "cocoon-clone-" + vmID[:8] + vmCfg.Name = "cocoon-clone-" + network.VMIDPrefix(vmID) } // Auto-pull base image if --pull is set (cross-node clone). diff --git a/hypervisor/cloudhypervisor/extend.go b/hypervisor/cloudhypervisor/extend.go index 1d80dcc..024b92e 100644 --- a/hypervisor/cloudhypervisor/extend.go +++ b/hypervisor/cloudhypervisor/extend.go @@ -67,10 +67,6 @@ func (ch *CloudHypervisor) FsDetach(ctx context.Context, vmRef, tag string) erro } // FsList enumerates currently attached vhost-user-fs devices. -// -// TODO(inspect): cmd/vm Inspect calls FsList and DeviceList back-to-back, -// each fetching its own vm.info. A combined Lister returning both arrays -// from a single vm.info call would halve the round-trips on a running VM. func (ch *CloudHypervisor) FsList(ctx context.Context, vmRef string) ([]fs.Attached, error) { return listWith(ctx, ch, vmRef, func(info *chVMInfoResponse) []fs.Attached { out := make([]fs.Attached, 0, len(info.Config.Fs)) @@ -129,8 +125,6 @@ func (ch *CloudHypervisor) DeviceDetach(ctx context.Context, vmRef, id string) e } // DeviceList enumerates currently attached VFIO PCI passthrough devices. -// -// TODO(inspect): see FsList note — combined Lister would dedupe vm.info. 
func (ch *CloudHypervisor) DeviceList(ctx context.Context, vmRef string) ([]vfio.Attached, error) { return listWith(ctx, ch, vmRef, func(info *chVMInfoResponse) []vfio.Attached { out := make([]vfio.Attached, 0, len(info.Config.Devices)) @@ -187,6 +181,13 @@ func (ch *CloudHypervisor) attachWith( if pci.ID != "" { return pci.ID, nil } + // CH always returns 200 with PciDeviceInfo on add-fs/add-device, so this + // path means we accepted the alt 204 success code and lost the body. The + // fallback covers the user-supplied id; an empty fallback (e.g. VFIO with + // no --id) would otherwise leave the caller without a detach key. + if fallbackID == "" { + return "", fmt.Errorf("%s: empty response body and no fallback id (CH returned no PciDeviceInfo)", endpoint) + } return fallbackID, nil } diff --git a/hypervisor/cloudhypervisor/helper.go b/hypervisor/cloudhypervisor/helper.go index 69c3822..627deb3 100644 --- a/hypervisor/cloudhypervisor/helper.go +++ b/hypervisor/cloudhypervisor/helper.go @@ -93,21 +93,10 @@ func hasMemoryRangeFile(srcDir string) (bool, error) { return false, nil } -func vmAPI(ctx context.Context, hc *http.Client, endpoint string, body []byte, successCodes ...int) error { - _, err := vmAPICall(ctx, hc, endpoint, body, successCodes...) - return err -} - -// vmAPICall returns the raw response body so callers that need to decode -// PciDeviceInfo (vm.add-fs, vm.add-device, ...) can use the same retry path. -func vmAPICall(ctx context.Context, hc *http.Client, endpoint string, body []byte, successCodes ...int) ([]byte, error) { - return utils.DoAPIWithRetry(ctx, hc, http.MethodPut, "http://localhost/api/v1/"+endpoint, body, successCodes...) -} - -// vmAPIOnce sends a single PUT without DoWithRetry. Use for non-idempotent -// endpoints — e.g. vm.add-fs / vm.add-device — where a retry after a -// network drop or 5xx that landed on CH after the device was already added -// would surface as a misleading "duplicate id" rejection. +// vmAPIOnce sends a single PUT without DoWithRetry. Used for non-idempotent +// endpoints where a retry after a lost response could mask the original +// success as a misleading "duplicate id" / wrong-state rejection. Returns the +// raw body so add-fs/add-device callers can decode PciDeviceInfo. func vmAPIOnce(ctx context.Context, hc *http.Client, endpoint string, body []byte, successCodes ...int) ([]byte, error) { return utils.DoAPIOnce(ctx, hc, http.MethodPut, "http://localhost/api/v1/"+endpoint, body, successCodes...) } @@ -225,7 +214,8 @@ func decodePciDeviceInfo(resp []byte) (chPciDeviceInfo, error) { } func powerButton(ctx context.Context, hc *http.Client) error { - return vmAPI(ctx, hc, "vm.power-button", nil) + _, err := vmAPIOnce(ctx, hc, "vm.power-button", nil) + return err } // queryConsolePTY retrieves the virtio-console PTY path from a running CH diff --git a/hypervisor/cloudhypervisor/restore.go b/hypervisor/cloudhypervisor/restore.go index 5f5310b..a47e847 100644 --- a/hypervisor/cloudhypervisor/restore.go +++ b/hypervisor/cloudhypervisor/restore.go @@ -26,17 +26,8 @@ func (ch *CloudHypervisor) Restore(ctx context.Context, vmRef string, vmCfg *typ }) } -// preflightRestore validates the sidecar and asserts the snapshot's role -// sequence is a valid prefix of rec (cidata-only suffix is the one extension). 
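+// preflightRestore delegates to the shared hypervisor.PreflightRestore;
+// the CH-specific piece is validateSnapshotIntegrity.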
func (ch *CloudHypervisor) preflightRestore(srcDir string, rec *hypervisor.VMRecord) error { - meta, err := hypervisor.LoadAndValidateMeta(srcDir, ch.conf.RootDir, ch.conf.Config.RunDir) - if err != nil { - return err - } - if err := validateSnapshotIntegrity(srcDir, meta.StorageConfigs); err != nil { - return err - } - return hypervisor.ValidateRoleSequence(meta.StorageConfigs, rec.StorageConfigs) + return hypervisor.PreflightRestore(srcDir, ch.conf.RootDir, ch.conf.Config.RunDir, rec, validateSnapshotIntegrity) } func (ch *CloudHypervisor) killForRestore(ctx context.Context, vmID string, rec *hypervisor.VMRecord) error { diff --git a/hypervisor/cloudhypervisor/snapshot.go b/hypervisor/cloudhypervisor/snapshot.go index f5845a6..c66db9b 100644 --- a/hypervisor/cloudhypervisor/snapshot.go +++ b/hypervisor/cloudhypervisor/snapshot.go @@ -32,10 +32,7 @@ func (ch *CloudHypervisor) Snapshot(ctx context.Context, ref string) (*types.Sna directBoot := isDirectBoot(rec.BootConfig) cowPath := ch.cowPath(vmID, directBoot) - cowName := "overlay.qcow2" - if directBoot { - cowName = "cow.raw" - } + cowName := filepath.Base(cowPath) tmpDir, err := os.MkdirTemp(ch.conf.VMRunDir(vmID), "snapshot-") if err != nil { diff --git a/hypervisor/cloudhypervisor/start.go b/hypervisor/cloudhypervisor/start.go index 4404616..58d37b6 100644 --- a/hypervisor/cloudhypervisor/start.go +++ b/hypervisor/cloudhypervisor/start.go @@ -55,6 +55,7 @@ func (ch *CloudHypervisor) launchProcess(ctx context.Context, rec *hypervisor.VM defer logFile.Close() //nolint:errcheck } + // shell out: the cloud-hypervisor binary is the authoritative VMM. cmd := exec.Command(ch.conf.CHBinary, args...) //nolint:gosec // Setpgid so CH survives if this process exits. cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} diff --git a/hypervisor/cloudhypervisor/utils.go b/hypervisor/cloudhypervisor/utils.go index 49d6c0a..c0820e5 100644 --- a/hypervisor/cloudhypervisor/utils.go +++ b/hypervisor/cloudhypervisor/utils.go @@ -7,6 +7,7 @@ import ( "io" "os" "path/filepath" + "strconv" "strings" "github.com/projecteru2/core/log" @@ -46,7 +47,7 @@ func qemuExpandImage(ctx context.Context, path string, targetSize int64, directB if targetSize <= virtualSize { return nil } - if err := utils.RunQemuImg(ctx, "resize", path, fmt.Sprintf("%d", targetSize)); err != nil { + if err := utils.RunQemuImg(ctx, "resize", path, strconv.FormatInt(targetSize, 10)); err != nil { return fmt.Errorf("resize %s: %w", path, err) } return nil diff --git a/hypervisor/firecracker/api.go b/hypervisor/firecracker/api.go index 9119343..485a9a6 100644 --- a/hypervisor/firecracker/api.go +++ b/hypervisor/firecracker/api.go @@ -114,44 +114,33 @@ func fcAPIOnce(ctx context.Context, hc *http.Client, method, endpoint string, bo return err } -func putMachineConfig(ctx context.Context, hc *http.Client, cfg fcMachineConfig) error { - body, err := json.Marshal(cfg) +// putJSON marshals payload and PUTs it to endpoint via fcAPI. 
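+// kind names the payload in the marshal error message.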
+func putJSON[T any](ctx context.Context, hc *http.Client, endpoint string, payload T, kind string) error { + body, err := json.Marshal(payload) if err != nil { - return fmt.Errorf("marshal machine-config: %w", err) + return fmt.Errorf("marshal %s: %w", kind, err) } - return fcAPI(ctx, hc, "/machine-config", body) + return fcAPI(ctx, hc, endpoint, body) +} + +func putMachineConfig(ctx context.Context, hc *http.Client, cfg fcMachineConfig) error { + return putJSON(ctx, hc, "/machine-config", cfg, "machine-config") } func putBootSource(ctx context.Context, hc *http.Client, boot fcBootSource) error { - body, err := json.Marshal(boot) - if err != nil { - return fmt.Errorf("marshal boot-source: %w", err) - } - return fcAPI(ctx, hc, "/boot-source", body) + return putJSON(ctx, hc, "/boot-source", boot, "boot-source") } func putDrive(ctx context.Context, hc *http.Client, drive fcDrive) error { - body, err := json.Marshal(drive) - if err != nil { - return fmt.Errorf("marshal drive: %w", err) - } - return fcAPI(ctx, hc, "/drives/"+drive.DriveID, body) + return putJSON(ctx, hc, "/drives/"+drive.DriveID, drive, "drive") } func putBalloon(ctx context.Context, hc *http.Client, balloon fcBalloon) error { - body, err := json.Marshal(balloon) - if err != nil { - return fmt.Errorf("marshal balloon: %w", err) - } - return fcAPI(ctx, hc, "/balloon", body) + return putJSON(ctx, hc, "/balloon", balloon, "balloon") } func putNetworkInterface(ctx context.Context, hc *http.Client, iface fcNetworkInterface) error { - body, err := json.Marshal(iface) - if err != nil { - return fmt.Errorf("marshal network-interface: %w", err) - } - return fcAPI(ctx, hc, "/network-interfaces/"+iface.IfaceID, body) + return putJSON(ctx, hc, "/network-interfaces/"+iface.IfaceID, iface, "network-interface") } func putEntropy(ctx context.Context, hc *http.Client) error { @@ -171,7 +160,7 @@ func sendCtrlAltDel(ctx context.Context, hc *http.Client) error { if err != nil { return fmt.Errorf("marshal action: %w", err) } - return fcAPI(ctx, hc, "/actions", body) + return fcAPIOnce(ctx, hc, http.MethodPut, "/actions", body) } // pauseVM pauses a running FC instance via PATCH /vm. Non-idempotent: a diff --git a/hypervisor/firecracker/clone.go b/hypervisor/firecracker/clone.go index 30c263f..f1effd6 100644 --- a/hypervisor/firecracker/clone.go +++ b/hypervisor/firecracker/clone.go @@ -83,8 +83,8 @@ func (fc *Firecracker) cloneAfterExtract(ctx context.Context, vmID string, vmCfg } // FC snapshot/load requires drives at source absolute paths; only COW path - // changed, so symlink-redirect the source path until drive_overrides - // (firecracker-microvm/firecracker#5774) lands. + // changed, so symlink-redirect the source path until upstream supports + // drive overrides at load time. sockPath := hypervisor.SocketPath(runDir) withNetwork := len(networkConfigs) > 0 var pid int diff --git a/hypervisor/firecracker/helper.go b/hypervisor/firecracker/helper.go index 244eb3b..fc84016 100644 --- a/hypervisor/firecracker/helper.go +++ b/hypervisor/firecracker/helper.go @@ -6,21 +6,22 @@ import ( "path/filepath" "github.com/cocoonstack/cocoon/hypervisor" + "github.com/cocoonstack/cocoon/types" ) const pidFileName = "fc.pid" var runtimeFiles = []string{hypervisor.APISocketName, pidFileName, hypervisor.ConsoleSockName} -// preflightRestore is the FC preflight: common checks, role-sequence match, -// vmstate + mem files. FC vmstate is binary so the sidecar is the only -// cocoon-side disk-shape source — there's no chCfg counterpart. 
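+// preflightRestore delegates to the shared template; the FC-specific
+// checks live in snapshotIntegrity below.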
func (fc *Firecracker) preflightRestore(srcDir string, rec *hypervisor.VMRecord) error { - meta, err := hypervisor.LoadAndValidateMeta(srcDir, fc.conf.RootDir, fc.conf.Config.RunDir) - if err != nil { - return err - } - if err := hypervisor.ValidateSnapshotIntegrity(srcDir, meta.StorageConfigs); err != nil { + return hypervisor.PreflightRestore(srcDir, fc.conf.RootDir, fc.conf.Config.RunDir, rec, snapshotIntegrity) +} + +// snapshotIntegrity runs the cross-backend file/role checks then asserts the +// FC-specific vmstate+mem files exist (FC vmstate is binary so the sidecar is +// the only cocoon-side disk-shape source — no chCfg counterpart). +func snapshotIntegrity(srcDir string, sidecar []*types.StorageConfig) error { + if err := hypervisor.ValidateSnapshotIntegrity(srcDir, sidecar); err != nil { return err } for _, fname := range []string{snapshotVMStateFile, snapshotMemFile} { @@ -28,5 +29,5 @@ func (fc *Firecracker) preflightRestore(srcDir string, rec *hypervisor.VMRecord) return fmt.Errorf("snapshot file %s missing: %w", fname, statErr) } } - return hypervisor.ValidateRoleSequence(meta.StorageConfigs, rec.StorageConfigs) + return nil } diff --git a/hypervisor/firecracker/start.go b/hypervisor/firecracker/start.go index 3e0dbdc..8ef89bb 100644 --- a/hypervisor/firecracker/start.go +++ b/hypervisor/firecracker/start.go @@ -8,6 +8,7 @@ import ( "os" "os/exec" "path/filepath" + "strconv" "syscall" "github.com/creack/pty" @@ -224,7 +225,7 @@ func (fc *Firecracker) startConsoleRelay(_ context.Context, runDir string, maste relayCmd := exec.Command(self) //nolint:gosec relayCmd.Env = []string{ relayEnvKey + "=1", - relayPIDEnvKey + "=" + fmt.Sprintf("%d", fcPID), + relayPIDEnvKey + "=" + strconv.Itoa(fcPID), } relayCmd.ExtraFiles = []*os.File{master, listenerFile} relayCmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} diff --git a/hypervisor/snapshot.go b/hypervisor/snapshot.go index 4b08c9f..4cb16e8 100644 --- a/hypervisor/snapshot.go +++ b/hypervisor/snapshot.go @@ -49,6 +49,20 @@ func LoadAndValidateMeta(dir, rootDir, runDir string) (*SnapshotMeta, error) { return meta, nil } +// PreflightRestore is the shared restore preflight: load+validate sidecar, +// run backend-specific integrity, then assert the snapshot's role sequence +// is a valid prefix of rec. +func PreflightRestore(srcDir, rootDir, runDir string, rec *VMRecord, integrity func(srcDir string, sidecar []*types.StorageConfig) error) error { + meta, err := LoadAndValidateMeta(srcDir, rootDir, runDir) + if err != nil { + return err + } + if err := integrity(srcDir, meta.StorageConfigs); err != nil { + return err + } + return ValidateRoleSequence(meta.StorageConfigs, rec.StorageConfigs) +} + func CloneStorageConfigs(storageConfigs []*types.StorageConfig) []*types.StorageConfig { out := make([]*types.StorageConfig, 0, len(storageConfigs)) for _, sc := range storageConfigs { diff --git a/images/cloudimg/inspect.go b/images/cloudimg/inspect.go index 2755d45..f1de2bf 100644 --- a/images/cloudimg/inspect.go +++ b/images/cloudimg/inspect.go @@ -12,12 +12,6 @@ import ( "os/exec" ) -type sourceImageInfo struct { - Format string // "qcow2" or "raw" - Compat string // qcow2 compat level (e.g. "0.10", "1.1"); empty for non-qcow2 - HasBackingFile bool -} - var ( // qcow2Magic is the qcow2 file signature. qcow2Magic = []byte{'Q', 'F', 'I', 0xfb} @@ -33,6 +27,7 @@ var ( {[]byte("