
Commit 45c61bb

Authored by Kevin Bowersox (kevinmichaelbowersox), with Copilot and vazois
Improve Failover Restore / Replication Handling (#1670)
* Improve Failover Restore / Replication Handling

* Update libs/common/ExceptionInjectionType.cs

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Apply PR Feedback

---------

Co-authored-by: Kevin Bowersox <kbowersox@microsoft.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Vasileios Zois <96085550+vazois@users.noreply.github.com>
1 parent 144477c commit 45c61bb

5 files changed

Lines changed: 263 additions & 4 deletions


libs/cluster/Server/Failover/FailoverManager.cs

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ internal sealed class FailoverManager : IDisposable
         readonly TimeSpan clusterTimeout;
         readonly ILogger logger;
         private SingleWriterMultiReaderLock failoverTaskLock;
-        public FailoverStatus lastFailoverStatus = FailoverStatus.NO_FAILOVER;
+        public volatile FailoverStatus lastFailoverStatus = FailoverStatus.NO_FAILOVER;

         /// <summary>
         /// Shared epoch instance for failover GarnetClient connections
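Note: lastFailoverStatus is written by the failover task and polled from EnsureReplication on a different thread, so the volatile qualifier guarantees the reader observes the latest write. A minimal sketch of that cross-thread pattern (reduced enum, hypothetical class and method names, not Garnet's actual types):

enum FailoverStatus { NO_FAILOVER, BEGIN_FAILOVER, TAKING_OVER_AS_PRIMARY }

class FailoverManagerSketch
{
    // Without 'volatile', the polling thread could keep reading a stale
    // NO_FAILOVER value and re-enter the resync path mid-failover.
    public volatile FailoverStatus lastFailoverStatus = FailoverStatus.NO_FAILOVER;

    // Failover task (writer thread)
    public void StartFailover() => lastFailoverStatus = FailoverStatus.BEGIN_FAILOVER;

    // EnsureReplication poll (reader thread)
    public bool FailoverInProgress() => lastFailoverStatus != FailoverStatus.NO_FAILOVER;
}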

libs/cluster/Server/Failover/ReplicaFailoverSession.cs

Lines changed: 33 additions & 3 deletions
@@ -119,6 +119,11 @@ private bool TakeOverAsPrimary()

             try
             {
+#if DEBUG
+                // Exception injection point for testing: simulates TakeOverAsPrimary failure
+                // after PauseWritesAndWaitForSync has already sent failstopwrites to the primary.
+                ExceptionInjectionHelper.TriggerException(ExceptionInjectionType.Failover_Fail_TakeOverAsPrimary);
+#endif
                 // Make replica syncing unavailable by setting recovery flag
                 if (!clusterProvider.replicationManager.BeginRecovery(RecoveryStatus.ClusterFailover, upgradeLock: false))
                 {
@@ -272,6 +277,14 @@ private async Task IssueAttachReplicas()
             }
         }

+        /// <summary>
+        /// Returns true if failstopwrites was confirmed by the primary and the primary's
+        /// config was modified (slots given up, role changed to replica). Used to determine
+        /// whether the primary needs to be reset on failover failure.
+        /// </summary>
+        private bool PrimaryNeedsReset()
+            => status is FailoverStatus.WAITING_FOR_SYNC or FailoverStatus.TAKING_OVER_AS_PRIMARY;
+
         /// <summary>
         /// REPLICA main failover task
         /// </summary>
@@ -281,6 +294,7 @@ public async Task<bool> BeginAsyncReplicaFailover()
             // CLUSTER FAILOVER OPTIONS
             // FORCE: Do not await for the primary since it might be unreachable
             // TAKEOVER: Same as force but also do not await for voting from other primaries
+            var failoverSucceeded = false;
             try
             {
                 // Issue stop writes and on ack wait for replica to catch up
@@ -298,11 +312,9 @@ public async Task<bool> BeginAsyncReplicaFailover()
                 // Transition to primary role
                 if (!TakeOverAsPrimary())
                 {
-                    // Request primary to be reset to original state only if DEFAULT option was used
-                    if (primaryClient != null)
-                        _ = await primaryClient?.failstopwrites(Array.Empty<byte>()).WaitAsync(failoverTimeout, cts.Token);
                     return false;
                 }
+                failoverSucceeded = true;

                 // Attach to old replicas, and old primary if DEFAULT option
                 await IssueAttachReplicas();
@@ -319,6 +331,24 @@ public async Task<bool> BeginAsyncReplicaFailover()
             }
             finally
             {
+                // If failstopwrites was confirmed by the primary (status reached WAITING_FOR_SYNC
+                // or beyond) but the failover did not succeed, reset the primary back to its
+                // original state. Without this, the primary has already given up its slots
+                // (via TryStopWrites) but the replica never claimed them, leaving the cluster
+                // in an incoherent state where no node owns the slots.
+                if (PrimaryNeedsReset() && !failoverSucceeded)
+                {
+                    try
+                    {
+                        logger?.LogWarning("Attempting to reset primary after failed failover");
+                        _ = await primaryClient?.failstopwrites(Array.Empty<byte>()).WaitAsync(failoverTimeout, cts.Token);
+                    }
+                    catch (Exception ex)
+                    {
+                        logger?.LogError(ex, "Failed to reset primary after failed failover — cluster may be in an incoherent state");
+                    }
+                }
+
                 primaryClient?.Dispose();
                 status = FailoverStatus.NO_FAILOVER;
             }
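Note: moving the reset into the finally block means it now covers every failure path after the primary confirmed failstopwrites, not only the single explicit TakeOverAsPrimary failure branch the old code handled. A reduced sketch of the resulting control flow (stubbed helpers and hypothetical names, not the actual Garnet signatures):

using System;
using System.Threading.Tasks;

class FailoverFlowSketch
{
    enum Status { NO_FAILOVER, WAITING_FOR_SYNC, TAKING_OVER_AS_PRIMARY }
    Status status = Status.NO_FAILOVER;

    // Stubs standing in for the real session methods.
    Task<bool> PauseWritesAndWaitForSync()
    {
        // Status only advances once the primary confirms failstopwrites;
        // a failure before that point needs no reset.
        status = Status.WAITING_FOR_SYNC;
        return Task.FromResult(true);
    }
    bool TakeOverAsPrimary() { status = Status.TAKING_OVER_AS_PRIMARY; return false; }
    Task ResetPrimary() { Console.WriteLine("resetting primary"); return Task.CompletedTask; }

    bool PrimaryNeedsReset() => status is Status.WAITING_FOR_SYNC or Status.TAKING_OVER_AS_PRIMARY;

    public async Task<bool> BeginAsyncReplicaFailover()
    {
        var failoverSucceeded = false;
        try
        {
            if (!await PauseWritesAndWaitForSync()) return false;
            if (!TakeOverAsPrimary()) return false; // primary already gave up its slots here
            failoverSucceeded = true;
            return true;
        }
        finally
        {
            // Runs on every exit path, including exceptions: restore the old primary
            // whenever it gave up its slots but the replica never claimed them.
            if (PrimaryNeedsReset() && !failoverSucceeded)
                await ResetPrimary();
        }
    }
}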

libs/cluster/Server/Replication/ReplicationManager.cs

Lines changed: 14 additions & 0 deletions
@@ -213,6 +213,20 @@ public void EnsureReplication(ClusterSession activeSession, IEnumerable<ICluster
                 return;
             }

+            // Suppress auto-resync while a failover is in progress.
+            // Without this guard, EnsureReplication would acquire a ReadRole lock that blocks
+            // TakeOverAsPrimary from obtaining its ClusterFailover write lock, aborting the failover.
+            var failoverStatus = clusterProvider.failoverManager.lastFailoverStatus;
+            if (failoverStatus is FailoverStatus.BEGIN_FAILOVER
+                or FailoverStatus.ISSUING_PAUSE_WRITES
+                or FailoverStatus.WAITING_FOR_SYNC
+                or FailoverStatus.FAILOVER_IN_PROGRESS
+                or FailoverStatus.TAKING_OVER_AS_PRIMARY)
+            {
+                logger?.LogDebug("Suppressing auto-resync during active failover (status: {failoverStatus})", failoverStatus);
+                return;
+            }
+
             // Now we're going to attempt to re-establish replication

             // To avoid a TOCTOU issue, we need to prevent role change while we do this
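Note: an illustration of the contention the guard avoids, using ReaderWriterLockSlim as a stand-in for Garnet's internal SingleWriterMultiReaderLock (the real type and its API differ). While the resync path holds a read lock, a non-blocking attempt to take the write lock fails, which is what previously aborted the takeover:

using System;
using System.Threading;
using System.Threading.Tasks;

class LockContentionIllustration
{
    static void Main()
    {
        var roleLock = new ReaderWriterLockSlim();

        // EnsureReplication path: holds a read lock for the duration of the resync.
        roleLock.EnterReadLock();
        try
        {
            // TakeOverAsPrimary path (a different thread): a try-acquire of the
            // write lock fails while any reader is active.
            var acquired = Task.Run(() => roleLock.TryEnterWriteLock(0)).Result;
            Console.WriteLine($"Write lock acquired: {acquired}"); // False
        }
        finally
        {
            roleLock.ExitReadLock();
        }
    }
}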

libs/common/ExceptionInjectionType.cs

Lines changed: 4 additions & 0 deletions
@@ -77,5 +77,9 @@ public enum ExceptionInjectionType
         /// During deletion of a Vector Set, leaving it partially deleted - at a particular point of execution.
         /// </summary>
         VectorSet_Interrupt_Delete_2,
+        /// <summary>
+        /// Fail TakeOverAsPrimary during failover by throwing before BeginRecovery is called.
+        /// </summary>
+        Failover_Fail_TakeOverAsPrimary,
     }
 }
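Note: a minimal sketch of the DEBUG-only injection pattern this enum member plugs into (hypothetical names, not Garnet's actual ExceptionInjectionHelper). A test enables a named injection point, and the code under test calls TriggerException at that point, which throws only while the flag is set:

using System;
using System.Collections.Concurrent;

enum InjectionType { Failover_Fail_TakeOverAsPrimary }

static class InjectionHelperSketch
{
    static readonly ConcurrentDictionary<InjectionType, bool> enabled = new();

    public static void EnableException(InjectionType t) => enabled[t] = true;
    public static void DisableException(InjectionType t) => enabled[t] = false;

    // Called on the production code path; a no-op unless a test enabled the point.
    public static void TriggerException(InjectionType t)
    {
        if (enabled.TryGetValue(t, out var on) && on)
            throw new InvalidOperationException($"Injected exception: {t}");
    }
}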

test/Garnet.test.cluster/ClusterNegativeTests.cs

Lines changed: 211 additions & 0 deletions
@@ -626,5 +626,216 @@ public void ClusterReplicateFails()
             var exc = Assert.Throws<RedisServerException>(() => replicaServer.Execute("CLUSTER", ["REPLICATE", Guid.NewGuid().ToString()], flags: CommandFlags.NoRedirect));
             ClassicAssert.IsTrue(exc.Message.StartsWith("ERR I don't know about node "));
         }
+        [Test, Order(14), CancelAfter(testTimeout)]
+        [Category("REPLICATION")]
+        public void ClusterFailoverSucceedsDuringEnsureReplication(CancellationToken cancellationToken)
+        {
+            // Verify that EnsureReplication does not block an in-flight failover.
+            // EnsureReplication polls for dropped replication sessions and attempts auto-resync.
+            // Without the failover status guard, it could acquire a ReadRole lock that blocks
+            // TakeOverAsPrimary from obtaining its ClusterFailover write lock, aborting the failover.
+            var primaryIndex = 0;
+            var replicaIndex = 1;
+            var nodes_count = 2;
+
+            // Enable EnsureReplication by setting clusterReplicationReestablishmentTimeout to 1 second
+            context.CreateInstances(nodes_count, disableObjects: true, enableAOF: true,
+                timeout: timeout, clusterReplicationReestablishmentTimeout: 1);
+            context.CreateConnection();
+
+            _ = context.clusterTestUtils.AddDelSlotsRange(primaryIndex, [(0, 16383)], addslot: true, logger: context.logger);
+            context.clusterTestUtils.SetConfigEpoch(primaryIndex, primaryIndex + 1, logger: context.logger);
+            context.clusterTestUtils.SetConfigEpoch(replicaIndex, replicaIndex + 1, logger: context.logger);
+            context.clusterTestUtils.Meet(primaryIndex, replicaIndex, logger: context.logger);
+            context.clusterTestUtils.WaitUntilNodeIsKnown(primaryIndex, replicaIndex, logger: context.logger);
+
+            // Set up replication
+            var resp = context.clusterTestUtils.ClusterReplicate(replicaNodeIndex: replicaIndex, primaryNodeIndex: primaryIndex, logger: context.logger);
+            ClassicAssert.AreEqual("OK", resp);
+            context.clusterTestUtils.WaitForReplicaRecovery(replicaIndex, context.logger);
+            context.clusterTestUtils.WaitForConnectedReplicaCount(primaryIndex, 1, context.logger);
+
+            // Populate primary and wait for sync
+            context.kvPairs = [];
+            context.PopulatePrimary(ref context.kvPairs, keyLength: 32, kvpairCount: 16, primaryIndex);
+            context.clusterTestUtils.WaitForReplicaAofSync(primaryIndex, replicaIndex, context.logger);
+
+            // Issue failover — with EnsureReplication enabled (polling every 1s),
+            // the guard should prevent it from interfering with the failover
+            resp = context.clusterTestUtils.ClusterFailover(replicaIndex, logger: context.logger);
+            ClassicAssert.AreEqual("OK", resp);
+
+            // Wait for failover to complete — without the guard this could hang or abort
+            context.clusterTestUtils.WaitForFailoverCompleted(replicaIndex, context.logger);
+
+            // The old primary should become a replica
+            context.clusterTestUtils.WaitForReplicaRecovery(primaryIndex, context.logger);
+
+            // Verify the new primary (formerly replica) is functional
+            var role = context.clusterTestUtils.RoleCommand(replicaIndex, context.logger);
+            ClassicAssert.AreEqual("master", role.Value);
+
+            var oldPrimaryRole = context.clusterTestUtils.RoleCommand(primaryIndex, context.logger);
+            ClassicAssert.AreEqual("slave", oldPrimaryRole.Value);
+
+            // Verify last failover state
+            var infoItem = context.clusterTestUtils.GetReplicationInfo(replicaIndex,
+                [ReplicationInfoItem.LAST_FAILOVER_STATE], logger: context.logger);
+            ClassicAssert.AreEqual("failover-completed", infoItem[0].Item2);
+        }
+
+        [Test, Order(15), CancelAfter(testTimeout)]
+        [Category("REPLICATION")]
+        public void ClusterEnsureReplicationWorksAfterFailover(CancellationToken cancellationToken)
+        {
+            // Verify that EnsureReplication still functions after a failover completes.
+            // The failover guard should only suppress auto-resync during active failover states,
+            // not after failover has completed or aborted.
+            var primaryIndex = 0;
+            var replicaIndex = 1;
+            var nodes_count = 2;
+
+            // Enable EnsureReplication with a 1-second poll frequency
+            context.CreateInstances(nodes_count, disableObjects: true, enableAOF: true,
+                timeout: timeout, clusterReplicationReestablishmentTimeout: 1);
+            context.CreateConnection();
+
+            _ = context.clusterTestUtils.AddDelSlotsRange(primaryIndex, [(0, 16383)], addslot: true, logger: context.logger);
+            context.clusterTestUtils.SetConfigEpoch(primaryIndex, primaryIndex + 1, logger: context.logger);
+            context.clusterTestUtils.SetConfigEpoch(replicaIndex, replicaIndex + 1, logger: context.logger);
+            context.clusterTestUtils.Meet(primaryIndex, replicaIndex, logger: context.logger);
+            context.clusterTestUtils.WaitUntilNodeIsKnown(primaryIndex, replicaIndex, logger: context.logger);
+
+            // Set up replication
+            var resp = context.clusterTestUtils.ClusterReplicate(replicaNodeIndex: replicaIndex, primaryNodeIndex: primaryIndex, logger: context.logger);
+            ClassicAssert.AreEqual("OK", resp);
+            context.clusterTestUtils.WaitForReplicaRecovery(replicaIndex, context.logger);
+            context.clusterTestUtils.WaitForConnectedReplicaCount(primaryIndex, 1, context.logger);
+
+            // Populate primary data and sync
+            context.kvPairs = [];
+            context.PopulatePrimary(ref context.kvPairs, keyLength: 32, kvpairCount: 16, primaryIndex);
+            context.clusterTestUtils.WaitForReplicaAofSync(primaryIndex, replicaIndex, context.logger);
+
+            // Run failover
+            resp = context.clusterTestUtils.ClusterFailover(replicaIndex, logger: context.logger);
+            ClassicAssert.AreEqual("OK", resp);
+            context.clusterTestUtils.WaitForFailoverCompleted(replicaIndex, context.logger);
+
+            // Old primary should now be a replica of the new primary
+            context.clusterTestUtils.WaitForReplicaRecovery(primaryIndex, context.logger);
+
+            // Verify last_failover_state is "failover-completed" — this should NOT suppress EnsureReplication
+            var infoItem = context.clusterTestUtils.GetReplicationInfo(replicaIndex,
+                [ReplicationInfoItem.LAST_FAILOVER_STATE], logger: context.logger);
+            ClassicAssert.AreEqual("failover-completed", infoItem[0].Item2);
+
+            // Verify replication is working in the new topology
+            // The old primary (index 0) is now a replica of the new primary (index 1)
+            // Write to new primary and verify it replicates to old primary (now replica)
+            var slotMap = new int[16384];
+            for (var i = 0; i < 16384; i++)
+                slotMap[i] = replicaIndex;
+
+            var newKvPairs = new Dictionary<string, int>();
+            context.PopulatePrimary(ref newKvPairs, keyLength: 32, kvpairCount: 8, replicaIndex, slotMap: slotMap);
+            context.clusterTestUtils.WaitForReplicaAofSync(replicaIndex, primaryIndex, context.logger);
+            context.ValidateKVCollectionAgainstReplica(ref newKvPairs, primaryIndex);
+        }
+
+#if DEBUG
+        [Test, Order(16), CancelAfter(testTimeout)]
+        [Category("REPLICATION")]
+        public void ClusterFailoverResetsPrimaryOnTakeOverFailure(CancellationToken cancellationToken)
+        {
+            // Verify that when TakeOverAsPrimary fails after PauseWritesAndWaitForSync
+            // has already sent failstopwrites to the primary, the primary is reset back
+            // to its original state (owns slots, is a primary).
+            //
+            // Without the reset in BeginAsyncReplicaFailover's finally block, the primary
+            // would have given up its slots (via TryStopWrites) but the replica never
+            // claimed them, leaving the cluster in an incoherent state.
+            var primaryIndex = 0;
+            var replicaIndex = 1;
+            var nodes_count = 2;
+
+            context.CreateInstances(nodes_count, disableObjects: true, enableAOF: true, timeout: timeout);
+            context.CreateConnection();
+
+            _ = context.clusterTestUtils.AddDelSlotsRange(primaryIndex, [(0, 16383)], addslot: true, logger: context.logger);
+            context.clusterTestUtils.SetConfigEpoch(primaryIndex, primaryIndex + 1, logger: context.logger);
+            context.clusterTestUtils.SetConfigEpoch(replicaIndex, replicaIndex + 1, logger: context.logger);
+            context.clusterTestUtils.Meet(primaryIndex, replicaIndex, logger: context.logger);
+            context.clusterTestUtils.WaitUntilNodeIsKnown(primaryIndex, replicaIndex, logger: context.logger);
+
+            // Set up replication
+            var resp = context.clusterTestUtils.ClusterReplicate(replicaNodeIndex: replicaIndex, primaryNodeIndex: primaryIndex, logger: context.logger);
+            ClassicAssert.AreEqual("OK", resp);
+            context.clusterTestUtils.WaitForReplicaRecovery(replicaIndex, context.logger);
+            context.clusterTestUtils.WaitForConnectedReplicaCount(primaryIndex, 1, context.logger);
+
+            // Populate data and sync
+            context.kvPairs = [];
+            context.PopulatePrimary(ref context.kvPairs, keyLength: 32, kvpairCount: 16, primaryIndex);
+            context.clusterTestUtils.WaitForReplicaAofSync(primaryIndex, replicaIndex, context.logger);
+
+            // Verify primary owns all slots before failover
+            var slotsBefore = context.clusterTestUtils.GetOwnedSlotsFromNode(primaryIndex, context.logger);
+            ClassicAssert.AreEqual(16384, slotsBefore.Count, "Primary should own all slots before failover");
+
+            try
+            {
+                // Enable exception injection to make TakeOverAsPrimary fail.
+                // This simulates the scenario where PauseWritesAndWaitForSync succeeds
+                // (failstopwrites sent to primary, primary gives up slots) but the
+                // subsequent TakeOverAsPrimary fails (e.g., due to lock contention from
+                // EnsureReplication holding the ReadRole lock).
+                ExceptionInjectionHelper.EnableException(ExceptionInjectionType.Failover_Fail_TakeOverAsPrimary);
+
+                // Issue DEFAULT failover — this will:
+                // 1. Send failstopwrites to primary (primary gives up slots)
+                // 2. Wait for sync
+                // 3. TakeOverAsPrimary — FAILS due to injected exception
+                // 4. finally block sends failstopwrites([]) to reset primary
+                resp = context.clusterTestUtils.ClusterFailover(replicaIndex, logger: context.logger);
+                ClassicAssert.AreEqual("OK", resp);
+
+                // Wait for failover to be aborted
+                while (true)
+                {
+                    var infoItem = context.clusterTestUtils.GetReplicationInfo(replicaIndex,
+                        [ReplicationInfoItem.LAST_FAILOVER_STATE], logger: context.logger);
+                    if (infoItem[0].Item2.Equals("failover-aborted"))
+                        break;
+                    ClusterTestUtils.BackOff(cancellationToken: cancellationToken, msg: "Waiting for failover to abort");
+                }
+            }
+            finally
+            {
+                ExceptionInjectionHelper.DisableException(ExceptionInjectionType.Failover_Fail_TakeOverAsPrimary);
+            }
+
+            // Verify primary has been reset: it should own all slots again
+            // Without the reset fix, the primary would have 0 slots here.
+            while (true)
+            {
+                var slotsAfter = context.clusterTestUtils.GetOwnedSlotsFromNode(primaryIndex, context.logger);
+                if (slotsAfter.Count == 16384)
+                    break;
+                ClusterTestUtils.BackOff(cancellationToken: cancellationToken, msg: $"Waiting for primary to reclaim slots (current: {slotsAfter.Count})");
+            }
+
+            // Verify primary is still a primary
+            var role = context.clusterTestUtils.RoleCommand(primaryIndex, logger: context.logger);
+            ClassicAssert.AreEqual("master", role.Value, "Primary should be reset back to master role");
+
+            // Verify replica is still a replica (failover was aborted)
+            role = context.clusterTestUtils.RoleCommand(replicaIndex, logger: context.logger);
+            ClassicAssert.AreEqual("slave", role.Value, "Replica should remain a slave after aborted failover");
+
+            // Verify data is still accessible from the primary
+            context.ValidateKVCollectionAgainstReplica(ref context.kvPairs, primaryIndex);
+        }
+#endif
     }
 }
