@@ -626,5 +626,216 @@ public void ClusterReplicateFails()
626626 var exc = Assert . Throws < RedisServerException > ( ( ) => replicaServer . Execute ( "CLUSTER" , [ "REPLICATE" , Guid . NewGuid ( ) . ToString ( ) ] , flags : CommandFlags . NoRedirect ) ) ;
627627 ClassicAssert . IsTrue ( exc . Message . StartsWith ( "ERR I don't know about node " ) ) ;
628628 }
/// <summary>
/// Checks that a CLUSTER FAILOVER issued on the replica runs to completion while
/// EnsureReplication is enabled with a one-second re-establishment timeout.
/// </summary>
/// <remarks>
/// EnsureReplication polls for dropped replication sessions and attempts an auto-resync.
/// If it were not guarded by the failover status, it could take a ReadRole lock that
/// prevents TakeOverAsPrimary from getting its ClusterFailover write lock, which would
/// abort the failover.
/// </remarks>
/// <param name="cancellationToken">
/// Token supplied for the <c>CancelAfter</c> timeout; this test body does not read it.
/// </param>
[Test, Order(14), CancelAfter(testTimeout)]
[Category("REPLICATION")]
public void ClusterFailoverSucceedsDuringEnsureReplication(CancellationToken cancellationToken)
{
    int originalPrimary = 0;
    int originalReplica = 1;
    int clusterSize = 2;

    // clusterReplicationReestablishmentTimeout: 1 switches EnsureReplication on (1 second).
    context.CreateInstances(
        clusterSize,
        disableObjects: true,
        enableAOF: true,
        timeout: timeout,
        clusterReplicationReestablishmentTimeout: 1);
    context.CreateConnection();

    // Give every slot to node 0, assign distinct config epochs, then introduce the nodes.
    _ = context.clusterTestUtils.AddDelSlotsRange(originalPrimary, [(0, 16383)], addslot: true, logger: context.logger);
    context.clusterTestUtils.SetConfigEpoch(originalPrimary, originalPrimary + 1, logger: context.logger);
    context.clusterTestUtils.SetConfigEpoch(originalReplica, originalReplica + 1, logger: context.logger);
    context.clusterTestUtils.Meet(originalPrimary, originalReplica, logger: context.logger);
    context.clusterTestUtils.WaitUntilNodeIsKnown(originalPrimary, originalReplica, logger: context.logger);

    // Attach node 1 as a replica of node 0 and wait until the primary reports it.
    var reply = context.clusterTestUtils.ClusterReplicate(
        replicaNodeIndex: originalReplica,
        primaryNodeIndex: originalPrimary,
        logger: context.logger);
    ClassicAssert.AreEqual("OK", reply);
    context.clusterTestUtils.WaitForReplicaRecovery(originalReplica, context.logger);
    context.clusterTestUtils.WaitForConnectedReplicaCount(originalPrimary, 1, context.logger);

    // Load a small data set and let the replica catch up on the AOF.
    context.kvPairs = [];
    context.PopulatePrimary(ref context.kvPairs, keyLength: 32, kvpairCount: 16, originalPrimary);
    context.clusterTestUtils.WaitForReplicaAofSync(originalPrimary, originalReplica, context.logger);

    // Start the failover while EnsureReplication keeps polling every second;
    // the guard is expected to keep the poller out of the way.
    reply = context.clusterTestUtils.ClusterFailover(originalReplica, logger: context.logger);
    ClassicAssert.AreEqual("OK", reply);

    // Without the guard this wait could hang, or the failover could abort.
    context.clusterTestUtils.WaitForFailoverCompleted(originalReplica, context.logger);

    // Node 0 is expected to come back as a replica.
    context.clusterTestUtils.WaitForReplicaRecovery(originalPrimary, context.logger);

    // Roles must have swapped: node 1 reports master, node 0 reports slave.
    var promotedRole = context.clusterTestUtils.RoleCommand(originalReplica, context.logger);
    ClassicAssert.AreEqual("master", promotedRole.Value);

    var demotedRole = context.clusterTestUtils.RoleCommand(originalPrimary, context.logger);
    ClassicAssert.AreEqual("slave", demotedRole.Value);

    // The promoted node must report the failover as completed.
    var failoverInfo = context.clusterTestUtils.GetReplicationInfo(
        originalReplica,
        [ReplicationInfoItem.LAST_FAILOVER_STATE],
        logger: context.logger);
    ClassicAssert.AreEqual("failover-completed", failoverInfo[0].Item2);
}
686+
/// <summary>
/// Checks that replication keeps working in the new topology once a failover has
/// completed with EnsureReplication enabled.
/// </summary>
/// <remarks>
/// The failover guard is meant to suppress auto-resync only while a failover is active,
/// not after it has completed or aborted. The test fails over to node 1, then writes to
/// node 1 and validates that the data shows up on node 0 (the former primary).
/// </remarks>
/// <param name="cancellationToken">
/// Token supplied for the <c>CancelAfter</c> timeout; this test body does not read it.
/// </param>
[Test, Order(15), CancelAfter(testTimeout)]
[Category("REPLICATION")]
public void ClusterEnsureReplicationWorksAfterFailover(CancellationToken cancellationToken)
{
    int originalPrimary = 0;
    int originalReplica = 1;
    int clusterSize = 2;

    // A one-second poll frequency turns EnsureReplication on.
    context.CreateInstances(
        clusterSize,
        disableObjects: true,
        enableAOF: true,
        timeout: timeout,
        clusterReplicationReestablishmentTimeout: 1);
    context.CreateConnection();

    // Give every slot to node 0, assign distinct config epochs, then introduce the nodes.
    _ = context.clusterTestUtils.AddDelSlotsRange(originalPrimary, [(0, 16383)], addslot: true, logger: context.logger);
    context.clusterTestUtils.SetConfigEpoch(originalPrimary, originalPrimary + 1, logger: context.logger);
    context.clusterTestUtils.SetConfigEpoch(originalReplica, originalReplica + 1, logger: context.logger);
    context.clusterTestUtils.Meet(originalPrimary, originalReplica, logger: context.logger);
    context.clusterTestUtils.WaitUntilNodeIsKnown(originalPrimary, originalReplica, logger: context.logger);

    // Attach node 1 as a replica of node 0 and wait until the primary reports it.
    var reply = context.clusterTestUtils.ClusterReplicate(
        replicaNodeIndex: originalReplica,
        primaryNodeIndex: originalPrimary,
        logger: context.logger);
    ClassicAssert.AreEqual("OK", reply);
    context.clusterTestUtils.WaitForReplicaRecovery(originalReplica, context.logger);
    context.clusterTestUtils.WaitForConnectedReplicaCount(originalPrimary, 1, context.logger);

    // Seed data on node 0 and wait for node 1 to sync.
    context.kvPairs = [];
    context.PopulatePrimary(ref context.kvPairs, keyLength: 32, kvpairCount: 16, originalPrimary);
    context.clusterTestUtils.WaitForReplicaAofSync(originalPrimary, originalReplica, context.logger);

    // Fail over to node 1 and wait for it to finish.
    reply = context.clusterTestUtils.ClusterFailover(originalReplica, logger: context.logger);
    ClassicAssert.AreEqual("OK", reply);
    context.clusterTestUtils.WaitForFailoverCompleted(originalReplica, context.logger);

    // Node 0 should now be replicating from node 1.
    context.clusterTestUtils.WaitForReplicaRecovery(originalPrimary, context.logger);

    // "failover-completed" is a terminal state and should NOT suppress EnsureReplication.
    var failoverInfo = context.clusterTestUtils.GetReplicationInfo(
        originalReplica,
        [ReplicationInfoItem.LAST_FAILOVER_STATE],
        logger: context.logger);
    ClassicAssert.AreEqual("failover-completed", failoverInfo[0].Item2);

    // Every one of the 16384 slots maps to node 1, the new primary.
    var slotOwners = new int[16384];
    Array.Fill(slotOwners, originalReplica);

    // Write through the new primary and confirm the old primary (now a replica) receives it.
    Dictionary<string, int> freshKvPairs = new();
    context.PopulatePrimary(ref freshKvPairs, keyLength: 32, kvpairCount: 8, originalReplica, slotMap: slotOwners);
    context.clusterTestUtils.WaitForReplicaAofSync(originalReplica, originalPrimary, context.logger);
    context.ValidateKVCollectionAgainstReplica(ref freshKvPairs, originalPrimary);
}
745+
746+ #if DEBUG
/// <summary>
/// Checks that the primary is restored to its original state (owns all slots, reports
/// master) when TakeOverAsPrimary fails after PauseWritesAndWaitForSync has already
/// sent failstopwrites to the primary.
/// </summary>
/// <remarks>
/// Without the reset in the finally block of BeginAsyncReplicaFailover, the primary would
/// have released its slots (via TryStopWrites) while the replica never claimed them,
/// leaving the cluster in an incoherent state. The failure is forced through
/// <c>ExceptionInjectionType.Failover_Fail_TakeOverAsPrimary</c>.
/// </remarks>
/// <param name="cancellationToken">
/// Token supplied for the <c>CancelAfter</c> timeout; forwarded to
/// <c>ClusterTestUtils.BackOff</c> in both polling loops.
/// </param>
[Test, Order(16), CancelAfter(testTimeout)]
[Category("REPLICATION")]
public void ClusterFailoverResetsPrimaryOnTakeOverFailure(CancellationToken cancellationToken)
{
    int primary = 0;
    int replica = 1;
    int clusterSize = 2;

    context.CreateInstances(clusterSize, disableObjects: true, enableAOF: true, timeout: timeout);
    context.CreateConnection();

    // Give every slot to node 0, assign distinct config epochs, then introduce the nodes.
    _ = context.clusterTestUtils.AddDelSlotsRange(primary, [(0, 16383)], addslot: true, logger: context.logger);
    context.clusterTestUtils.SetConfigEpoch(primary, primary + 1, logger: context.logger);
    context.clusterTestUtils.SetConfigEpoch(replica, replica + 1, logger: context.logger);
    context.clusterTestUtils.Meet(primary, replica, logger: context.logger);
    context.clusterTestUtils.WaitUntilNodeIsKnown(primary, replica, logger: context.logger);

    // Attach node 1 as a replica of node 0 and wait until the primary reports it.
    var reply = context.clusterTestUtils.ClusterReplicate(
        replicaNodeIndex: replica,
        primaryNodeIndex: primary,
        logger: context.logger);
    ClassicAssert.AreEqual("OK", reply);
    context.clusterTestUtils.WaitForReplicaRecovery(replica, context.logger);
    context.clusterTestUtils.WaitForConnectedReplicaCount(primary, 1, context.logger);

    // Seed data on the primary and wait for the replica to sync.
    context.kvPairs = [];
    context.PopulatePrimary(ref context.kvPairs, keyLength: 32, kvpairCount: 16, primary);
    context.clusterTestUtils.WaitForReplicaAofSync(primary, replica, context.logger);

    // Baseline: the primary owns the full slot range before any failover attempt.
    var ownedBefore = context.clusterTestUtils.GetOwnedSlotsFromNode(primary, context.logger);
    ClassicAssert.AreEqual(16384, ownedBefore.Count, "Primary should own all slots before failover");

    try
    {
        // Force TakeOverAsPrimary to fail. This models PauseWritesAndWaitForSync succeeding
        // (failstopwrites sent, primary gives up slots) followed by a TakeOverAsPrimary
        // failure, e.g. lock contention from EnsureReplication holding the ReadRole lock.
        ExceptionInjectionHelper.EnableException(ExceptionInjectionType.Failover_Fail_TakeOverAsPrimary);

        // A DEFAULT failover is expected to:
        //   1. send failstopwrites to the primary (primary gives up slots),
        //   2. wait for sync,
        //   3. fail in TakeOverAsPrimary because of the injected exception,
        //   4. send failstopwrites([]) from the finally block to reset the primary.
        reply = context.clusterTestUtils.ClusterFailover(replica, logger: context.logger);
        ClassicAssert.AreEqual("OK", reply);

        // Poll the replica until it reports the failover as aborted.
        var failoverInfo = context.clusterTestUtils.GetReplicationInfo(
            replica,
            [ReplicationInfoItem.LAST_FAILOVER_STATE],
            logger: context.logger);
        while (!failoverInfo[0].Item2.Equals("failover-aborted"))
        {
            ClusterTestUtils.BackOff(cancellationToken: cancellationToken, msg: "Waiting for failover to abort");
            failoverInfo = context.clusterTestUtils.GetReplicationInfo(
                replica,
                [ReplicationInfoItem.LAST_FAILOVER_STATE],
                logger: context.logger);
        }
    }
    finally
    {
        // Always switch the injection off again, whether or not the wait above succeeded.
        ExceptionInjectionHelper.DisableException(ExceptionInjectionType.Failover_Fail_TakeOverAsPrimary);
    }

    // Poll until the primary owns every slot again.
    // Without the reset fix it would be stuck at 0 slots here.
    var slotsAfter = context.clusterTestUtils.GetOwnedSlotsFromNode(primary, context.logger);
    while (slotsAfter.Count != 16384)
    {
        ClusterTestUtils.BackOff(cancellationToken: cancellationToken, msg: $"Waiting for primary to reclaim slots (current: {slotsAfter.Count})");
        slotsAfter = context.clusterTestUtils.GetOwnedSlotsFromNode(primary, context.logger);
    }

    // The primary must still report the master role.
    var nodeRole = context.clusterTestUtils.RoleCommand(primary, logger: context.logger);
    ClassicAssert.AreEqual("master", nodeRole.Value, "Primary should be reset back to master role");

    // The replica must still report the slave role, since the failover was aborted.
    nodeRole = context.clusterTestUtils.RoleCommand(replica, logger: context.logger);
    ClassicAssert.AreEqual("slave", nodeRole.Value, "Replica should remain a slave after aborted failover");

    // The seeded data must still be readable from the primary.
    context.ValidateKVCollectionAgainstReplica(ref context.kvPairs, primary);
}
839+ #endif
629840 }
630841}
0 commit comments