Skip to content

Commit a39ea69

Browse files
committed
feat(archon): add ZBVZeroBubble pipeline schedule support
Add V-style stage assignment for ZBVZeroBubble, where rank 0 holds both the first and last stages for near-zero pipeline bubbles. ZBV's split backward (I/W separation) is incompatible with torch.compile, op-level selective AC, and memory_budget AC. These are auto-detected and handled: compile is disabled, and incompatible AC modes fall back to full AC.

Key changes:
- Add "ZBVZeroBubble" to pp_schedule config choices
- Add V-style stage assignment in _get_stage_indices()
- Move _pp_last_stage_rank setup into _apply_pipeline_parallelism()
- Auto-disable torch.compile and incompatible AC modes for ZBV
- Add ZBV FQN generation tests and distributed test entries
- Add torchrun ZBV forward/backward test support
1 parent 0d11df6 commit a39ea69

File tree

7 files changed

+453
-196
lines changed

7 files changed

+453
-196
lines changed

areal/api/cli_args.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,7 @@ class ArchonEngineConfig:
453453
default="Interleaved1F1B",
454454
metadata={
455455
"help": "Pipeline parallel schedule type.",
456-
"choices": ["1F1B", "Interleaved1F1B"],
456+
"choices": ["1F1B", "Interleaved1F1B", "ZBVZeroBubble"],
457457
},
458458
)
459459
# NOTE: The following three PP layer distribution parameters are advanced options
@@ -466,7 +466,7 @@ class ArchonEngineConfig:
466466
"help": "Number of transformer layers per (virtual) pipeline stage. "
467467
"If set, num_virtual_stages is calculated from num_layers. "
468468
"If None, stages are inferred from schedule type "
469-
"(1 stage/rank for 1F1B, 2 stages/rank for Interleaved1F1B).",
469+
"(1 stage/rank for 1F1B, 2 stages/rank for Interleaved1F1B/ZBVZeroBubble).",
470470
},
471471
)
472472
pp_first_stage_less_layers: int = field(

areal/experimental/engine/archon_engine.py

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212
import torch
1313
import torch.distributed as dist
1414
from torch import nn
15+
from torch.distributed.pipelining.schedules import (
16+
ScheduleZBVZeroBubble,
17+
get_schedule_class,
18+
)
1519
from transformers import (
1620
AutoConfig,
1721
PretrainedConfig,
@@ -211,9 +215,9 @@ def create_process_group(
211215

212216
# Pipeline parallel rank
213217
if self.parallel_dims.pp_enabled:
214-
pp_group = self.parallel_dims.get_group("pp")
215218
self._pp_rank = self.parallel_dims.get_mesh("pp").get_local_rank()
216-
self._pp_last_stage_rank = dist.get_process_group_ranks(pp_group)[-1]
219+
# Set in _apply_pipeline_parallelism() after pipeline setup
220+
self._pp_last_stage_rank = None
217221
else:
218222
self._pp_rank = 0
219223
self._pp_last_stage_rank = None
@@ -269,6 +273,36 @@ def initialize(self, addr: str | None, ft_spec: FinetuneSpec, *args, **kwargs):
269273
ac_config = self._build_ac_config()
270274
enable_compile = self.config.archon.enable_compile
271275

276+
# ZBVZeroBubble splits backward into I (input grad) and W (weight grad)
277+
# steps. This is incompatible with:
278+
# 1. torch.compile — its donated buffer optimization assumes a single
279+
# backward pass (retain_graph=False).
280+
# 2. Op-level selective AC — its per-op cache (storage.pop) is consumed
281+
# by the I step, leaving nothing for the W step recompute.
282+
# 3. memory_budget AC — it depends on torch.compile.
283+
# Full AC / layer-level selective AC use standard checkpoint_wrapper
284+
# whose gid-based recompute supports multiple backward passes.
285+
if self.config.archon.pp_schedule == "ZBVZeroBubble":
286+
if enable_compile:
287+
self.logger.warning(
288+
"ZBVZeroBubble is incompatible with torch.compile. "
289+
"Disabling torch.compile."
290+
)
291+
enable_compile = False
292+
293+
if ac_config is not None and (
294+
(
295+
ac_config.mode == "selective"
296+
and ac_config.selective_ac_option == "op"
297+
)
298+
or ac_config.mode == "memory_budget"
299+
):
300+
self.logger.warning(
301+
f"ZBVZeroBubble is incompatible with {ac_config.mode} AC. "
302+
"Falling back to full AC."
303+
)
304+
ac_config.mode = "full"
305+
272306
# Force pad_to_maximum when compile is enabled to avoid dynamic shape issues
273307
if enable_compile and not self.config.pad_to_maximum:
274308
self.logger.info(
@@ -790,6 +824,17 @@ def _apply_pipeline_parallelism(
790824
# Delete original model to free memory
791825
del self.model
792826

827+
# Determine which rank holds the last pipeline stage
828+
pp_group = self.parallel_dims.get_group("pp")
829+
pp_ranks = dist.get_process_group_ranks(pp_group)
830+
schedule_class = get_schedule_class(self.config.archon.pp_schedule)
831+
if schedule_class is ScheduleZBVZeroBubble:
832+
# V-style: rank 0 holds stages (0, num_stages-1)
833+
self._pp_last_stage_rank = pp_ranks[0]
834+
else:
835+
# Loop-style: last rank has last stage
836+
self._pp_last_stage_rank = pp_ranks[-1]
837+
793838
self.logger.info(
794839
f"PP enabled: has_first={self.pp_has_first_stage}, "
795840
f"has_last={self.pp_has_last_stage}"

areal/experimental/models/archon/pipeline_parallel.py

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
from torch.distributed.pipelining.schedules import (
1515
PipelineScheduleMulti,
1616
PipelineScheduleSingle,
17+
ScheduleDualPipeV,
18+
ScheduleZBVZeroBubble,
1719
get_schedule_class,
1820
)
1921

@@ -210,7 +212,7 @@ def pipeline_module_split(
210212
Args:
211213
whole_model: The complete model to split
212214
pp_mesh: Pipeline parallel device mesh
213-
pp_schedule: Schedule type ("1F1B" or "Interleaved1F1B")
215+
pp_schedule: Schedule type ("1F1B", "Interleaved1F1B", or "ZBVZeroBubble")
214216
device: Target device for stages
215217
module_names_per_stage: Module FQNs for each stage
216218
@@ -297,29 +299,31 @@ def _get_stage_indices() -> tuple[int, ...]:
297299
Examples (pp_degree=4, num_stages=8):
298300
1F1B: Rank 0->(0,), Rank 1->(1,), ...
299301
Interleaved1F1B: Rank 0->(0,4), Rank 1->(1,5), Rank 2->(2,6), Rank 3->(3,7)
302+
ZBVZeroBubble: Rank 0->(0,7), Rank 1->(1,6), Rank 2->(2,5), Rank 3->(3,4)
300303
"""
301304
if num_stages % pp_degree != 0:
302305
raise ValueError(
303-
f"num_stages ({num_stages}) must be divisible by pp_degree ({pp_degree})"
306+
f"num_stages ({num_stages}) must be evenly divisible by "
307+
f"pp_degree ({pp_degree})"
304308
)
305309
stages_per_rank = num_stages // pp_degree
306310

307-
if pp_schedule == "1F1B":
308-
if stages_per_rank != 1:
309-
raise ValueError(
310-
f"1F1B schedule requires exactly 1 stage per rank, "
311-
f"got {stages_per_rank} ({num_stages} stages / {pp_degree} ranks)"
312-
)
313-
return (pp_rank,)
314-
elif pp_schedule == "Interleaved1F1B":
315-
if stages_per_rank < 2:
311+
schedule_class = get_schedule_class(pp_schedule)
312+
v_style_schedules = (ScheduleZBVZeroBubble, ScheduleDualPipeV)
313+
style = "v" if schedule_class in v_style_schedules else "loop"
314+
315+
if style == "v":
316+
if stages_per_rank != 2:
316317
raise ValueError(
317-
f"Interleaved1F1B schedule requires >= 2 stages per rank, "
318-
f"got {stages_per_rank} ({num_stages} stages / {pp_degree} ranks)"
318+
f"V-style schedules require exactly 2 stages per rank, "
319+
f"got {stages_per_rank}"
319320
)
320-
return tuple(pp_rank + s * pp_degree for s in range(stages_per_rank))
321+
stage_v_pairs = list(
322+
zip(range(pp_degree), range(num_stages - 1, pp_degree - 1, -1))
323+
)
324+
return stage_v_pairs[pp_rank]
321325
else:
322-
raise ValueError(f"Unknown pp_schedule: {pp_schedule}")
326+
return tuple(pp_rank + s * pp_degree for s in range(stages_per_rank))
323327

324328
stages: list[PipelineStage] = []
325329
model_parts: list[nn.Module] = []
@@ -351,7 +355,7 @@ def pipeline_llm(
351355
352356
Workflow:
353357
1. Generate module names for each virtual stage
354-
2. Split model into stages (multiple per rank for Interleaved1F1B)
358+
2. Split model into stages (multiple per rank for Interleaved1F1B/ZBVZeroBubble)
355359
3. Apply parallelization (TP, FSDP) to each model part
356360
357361
Args:
@@ -364,7 +368,7 @@ def pipeline_llm(
364368
365369
Returns:
366370
Tuple of:
367-
- stages: List of PipelineStage (1 for 1F1B, 2+ for Interleaved1F1B)
371+
- stages: List of PipelineStage (1 for 1F1B, 2+ for Interleaved1F1B/ZBVZeroBubble)
368372
- model_parts: List of model parts
369373
- has_first_stage: Whether this rank has the first stage
370374
- has_last_stage: Whether this rank has the last stage
@@ -429,6 +433,14 @@ def pipeline_llm(
429433
f"but got {stages_per_rank} (from layers_per_stage={layers_per_stage}). "
430434
f"Use 1F1B schedule for single stage per rank."
431435
)
436+
# ZBVZeroBubble requires exactly 2 stages per rank
437+
if schedule_class is ScheduleZBVZeroBubble and stages_per_rank != 2:
438+
raise ValueError(
439+
f"ZBVZeroBubble requires exactly 2 stages per rank, "
440+
f"but got {stages_per_rank}. "
441+
f"Set pp_layers_per_stage to achieve 2 stages per rank, "
442+
f"or let it default (None)."
443+
)
432444
else:
433445
# Default: 1 for single-stage schedules, 2 for multi-stage schedules
434446
stages_per_rank = 1 if is_single_stage_schedule else 2

areal/tests/experimental/archon/test_distributed_pp.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
- test_pp_forward_4gpu: PP=4, tests forward with more stages
1616
- test_pp_backward_4gpu: PP=4, tests backward with more stages
1717
18+
ZBV Tests (2 GPU):
19+
- test_pp_zbv_forward_2gpu: PP=2, ZBVZeroBubble schedule forward
20+
- test_pp_zbv_backward_2gpu: PP=2, ZBVZeroBubble schedule backward
21+
1822
PP Combination Tests (4 GPU):
1923
- test_pp_tp_forward_4gpu: PP=2, TP=2, tests PP+TP combination
2024
- test_pp_dp_forward_4gpu: PP=2, DP=2, tests PP+DP combination
@@ -161,6 +165,55 @@ def test_pp_gradient_correctness_2gpu():
161165
)
162166

163167

168+
# =============================================================================
169+
# ZBV Tests (2 GPU)
170+
# =============================================================================
171+
172+
173+
@pytest.mark.multi_gpu
174+
@pytest.mark.slow
175+
def test_pp_zbv_forward_2gpu():
176+
"""Test ZBVZeroBubble forward pass with 2 GPUs (pp=2).
177+
178+
Validates that PP model with ZBVZeroBubble schedule produces correct output.
179+
Uses V-style stage assignment where rank 0 holds first and last stages.
180+
"""
181+
if current_platform.device_count() < 2:
182+
pytest.skip("This test requires 2 GPUs")
183+
184+
_run_pp_test_with_torchrun(
185+
"areal/tests/experimental/archon/torchrun/run_pp_tests.py",
186+
n_gpus=2,
187+
extra_args=[
188+
"--test_type=forward",
189+
"--pp_size=2",
190+
"--pp_schedule=ZBVZeroBubble",
191+
],
192+
)
193+
194+
195+
@pytest.mark.multi_gpu
196+
@pytest.mark.slow
197+
def test_pp_zbv_backward_2gpu():
198+
"""Test ZBVZeroBubble backward pass with 2 GPUs (pp=2).
199+
200+
Validates that gradients flow correctly through all PP stages
201+
using ZBVZeroBubble V-style stage assignment.
202+
"""
203+
if current_platform.device_count() < 2:
204+
pytest.skip("This test requires 2 GPUs")
205+
206+
_run_pp_test_with_torchrun(
207+
"areal/tests/experimental/archon/torchrun/run_pp_tests.py",
208+
n_gpus=2,
209+
extra_args=[
210+
"--test_type=backward",
211+
"--pp_size=2",
212+
"--pp_schedule=ZBVZeroBubble",
213+
],
214+
)
215+
216+
164217
# =============================================================================
165218
# 4 GPU Tests (Extended PP tests)
166219
# =============================================================================

areal/tests/experimental/archon/test_pipeline_parallel.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,3 +221,30 @@ def test_exact_distribution(self):
221221
assert result[2] == ["layers.3", "layers.4"]
222222
# Stage 3: 1 layer + norm + output (1) = 2
223223
assert result[3] == ["layers.5", "norm", "output"]
224+
225+
226+
class TestZBVFqnGeneration:
227+
"""Test FQN generation for ZBV pipeline configurations."""
228+
229+
def test_zbv_fqn_generation(self):
230+
"""Verify FQN distribution for a typical ZBV config (pp_degree=2, 8 layers)."""
231+
result = generate_llm_fqn_per_model_part(num_stages=4, num_layers=8)
232+
assert len(result) == 4
233+
234+
# Rank 0 gets stages (0, 3), rank 1 gets stages (1, 2)
235+
rank0_modules = result[0] + result[3]
236+
rank1_modules = result[1] + result[2]
237+
238+
# Rank 0 has first and last stages
239+
assert "tok_embeddings" in rank0_modules
240+
assert "norm" in rank0_modules
241+
assert "output" in rank0_modules
242+
243+
# Rank 1 has only middle layers (no embeddings or output head)
244+
assert all(m.startswith("layers.") for m in rank1_modules)
245+
246+
# All layers covered exactly once
247+
all_layers = []
248+
for stage in result:
249+
all_layers.extend([m for m in stage if m.startswith("layers.")])
250+
assert all_layers == [f"layers.{i}" for i in range(8)]

0 commit comments

Comments (0)