@@ -3,6 +3,7 @@ package cloudhypervisor
33import (
44 "context"
55 "errors"
6+ "time"
67
78 "github.com/projecteru2/cocoon/gc"
89 "github.com/projecteru2/cocoon/hypervisor"
@@ -13,10 +14,15 @@ import (
1314// compile-time interface check.
1415var _ hypervisor.Hypervisor = (* CloudHypervisor )(nil )
1516
17+ // creatingStateGCGrace is the minimum time since a "creating" record's
18+ // UpdatedAt before GC treats it as stale. This avoids racing with
19+ // legitimate long-running Create operations.
20+ const creatingStateGCGrace = 24 * time .Hour
21+
1622type chSnapshot struct {
1723 blobIDs map [string ]struct {} // union of all VMs' ImageBlobIDs
1824 vmIDs map [string ]struct {} // all VM IDs in the DB
19- staleCreate []string // IDs in "creating" state (crash remnants)
25+ staleCreate []string // IDs in stale "creating" state (crash remnants)
2026}
2127
2228func (s chSnapshot ) UsedBlobIDs () map [string ]struct {} { return s .blobIDs }
@@ -28,6 +34,7 @@ func (ch *CloudHypervisor) GCModule() gc.Module[chSnapshot] {
2834 Locker : ch .locker ,
2935 ReadDB : func (_ context.Context ) (chSnapshot , error ) {
3036 var snap chSnapshot
37+ cutoff := time .Now ().Add (- creatingStateGCGrace )
3138 if err := ch .store .Read (func (idx * hypervisor.VMIndex ) error {
3239 snap .blobIDs = make (map [string ]struct {})
3340 snap .vmIDs = make (map [string ]struct {})
@@ -39,7 +46,7 @@ func (ch *CloudHypervisor) GCModule() gc.Module[chSnapshot] {
3946 for hex := range rec .ImageBlobIDs {
4047 snap .blobIDs [hex ] = struct {}{}
4148 }
42- if rec .State == types .VMStateCreating {
49+ if rec .State == types .VMStateCreating && rec . UpdatedAt . Before ( cutoff ) {
4350 snap .staleCreate = append (snap .staleCreate , id )
4451 }
4552 }
@@ -50,10 +57,19 @@ func (ch *CloudHypervisor) GCModule() gc.Module[chSnapshot] {
5057 return snap , nil
5158 },
5259 Resolve : func (snap chSnapshot , _ map [string ]any ) []string {
53- // Orphan directories not in the DB.
54- orphans := utils .FilterUnreferenced (utils .ScanSubdirs (ch .conf .CHRunDir ()), snap .vmIDs )
55- // Stale "creating" records from interrupted Create calls.
56- return append (orphans , snap .staleCreate ... )
60+ runOrphans := utils .FilterUnreferenced (utils .ScanSubdirs (ch .conf .CHRunDir ()), snap .vmIDs )
61+ logOrphans := utils .FilterUnreferenced (utils .ScanSubdirs (ch .conf .CHLogDir ()), snap .vmIDs )
62+ candidates := append (append (runOrphans , logOrphans ... ), snap .staleCreate ... )
63+ seen := make (map [string ]struct {}, len (candidates ))
64+ var result []string
65+ for _ , id := range candidates {
66+ if _ , ok := seen [id ]; ok {
67+ continue
68+ }
69+ seen [id ] = struct {}{}
70+ result = append (result , id )
71+ }
72+ return result
5773 },
5874 Collect : func (ctx context.Context , ids []string ) error {
5975 var errs []error
@@ -63,23 +79,37 @@ func (ch *CloudHypervisor) GCModule() gc.Module[chSnapshot] {
6379 errs = append (errs , err )
6480 }
6581 }
66- // Clean up stale "creating" DB records.
67- if err := ch .cleanStalePlaceholders (ctx ); err != nil {
82+ // Clean up stale "creating" DB records from this GC snapshot.
83+ if err := ch .cleanStalePlaceholders (ctx , ids ); err != nil {
6884 errs = append (errs , err )
6985 }
7086 return errors .Join (errs ... )
7187 },
7288 }
7389}
7490
75- // cleanStalePlaceholders removes DB records stuck in "creating" state.
76- func (ch * CloudHypervisor ) cleanStalePlaceholders (_ context.Context ) error {
91+ // cleanStalePlaceholders removes selected DB records stuck in stale "creating"
92+ // state. IDs not found (or no longer stale) are skipped.
93+ func (ch * CloudHypervisor ) cleanStalePlaceholders (_ context.Context , ids []string ) error {
94+ if len (ids ) == 0 {
95+ return nil
96+ }
97+ targets := make (map [string ]struct {}, len (ids ))
98+ for _ , id := range ids {
99+ targets [id ] = struct {}{}
100+ }
101+ cutoff := time .Now ().Add (- creatingStateGCGrace )
77102 return ch .store .Write (func (idx * hypervisor.VMIndex ) error {
78- for id , rec := range idx .VMs {
79- if rec != nil && rec .State == types .VMStateCreating {
80- delete (idx .Names , rec .Config .Name )
81- delete (idx .VMs , id )
103+ for id := range targets {
104+ rec := idx .VMs [id ]
105+ if rec == nil {
106+ continue
107+ }
108+ if rec .State != types .VMStateCreating || rec .UpdatedAt .After (cutoff ) {
109+ continue
82110 }
111+ delete (idx .Names , rec .Config .Name )
112+ delete (idx .VMs , id )
83113 }
84114 return nil
85115 })
0 commit comments