Commit 71adffe

Authored and committed by Nytrynox
fix: clone async checkpoint tensors to CPU to prevent GPU OOM
Move tensor cloning to CPU in AsyncCheckpointIO._clone_tensor() to prevent doubling GPU memory usage during async checkpoint saves.

Previously, _clone_tensor() called t.detach().clone(), which allocates new GPU memory for each cloned tensor. For large model checkpoints (e.g., 15 GB+), this can cause GPU OOM errors, since the entire checkpoint is temporarily duplicated in GPU memory.

The fix changes the operation to t.detach().cpu().clone(), which moves tensors to CPU before cloning. CPU memory is typically abundant, and this achieves the same race-condition prevention without the GPU memory overhead.

Fixes #21630
1 parent 612ab08 commit 71adffe
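The commit message describes snapshotting a checkpoint on the caller thread, with tensors moved to CPU before cloning so the point-in-time copy never doubles GPU memory. A minimal, self-contained sketch of that idea (not the Lightning source; `clone_checkpoint_to_cpu` is a hypothetical helper for illustration):

```python
import torch

def clone_checkpoint_to_cpu(obj):
    """Recursively take a point-in-time CPU copy of tensors in a checkpoint-like structure."""
    if isinstance(obj, torch.Tensor):
        # detach from autograd, move to CPU first (so clone() does not allocate
        # new GPU memory), then clone for an independent copy
        return obj.detach().cpu().clone()
    if isinstance(obj, dict):
        return {k: clone_checkpoint_to_cpu(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return type(obj)(clone_checkpoint_to_cpu(v) for v in obj)
    return obj

checkpoint = {"state_dict": {"weight": torch.ones(2, 2)}, "epoch": 3}
snapshot = clone_checkpoint_to_cpu(checkpoint)

# mutating the original after snapshotting (as training would while a
# background thread writes the file) does not affect the copy
checkpoint["state_dict"]["weight"].zero_()
print(torch.equal(snapshot["state_dict"]["weight"], torch.ones(2, 2)))  # True
```

Because the snapshot is taken synchronously on the caller thread, the background writer only ever sees the frozen CPU copies, which is the race-condition guarantee the original `t.detach().clone()` already provided, now without the transient GPU duplication.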

File tree

2 files changed: +21 additions, −3 deletions


src/lightning/pytorch/plugins/io/async_plugin.py

Lines changed: 5 additions & 3 deletions
@@ -95,6 +95,8 @@ def teardown(self) -> None:

 # snapshot the checkpoint payload on the caller thread to avoid races with parameter mutation
 def _clone_tensor(t: torch.Tensor) -> torch.Tensor:
-    """Clones a tensor on the caller thread."""
-    # detach to avoid autograd history and clone to take a point-in-time copy
-    return t.detach().clone()
+    """Clones a tensor to CPU on the caller thread."""
+    # detach to avoid autograd history, move to CPU to avoid doubling GPU memory usage, and clone to take a
+    # point-in-time copy. Moving to CPU first is important because clone() on a CUDA tensor allocates new GPU
+    # memory, which can cause OOM errors for large model checkpoints.
+    return t.detach().cpu().clone()

tests/tests_pytorch/plugins/test_async_checkpoint.py

Lines changed: 16 additions & 0 deletions
@@ -51,3 +51,19 @@ def test_async_checkpoint_should_snapshot_values_before_mutation():
         "AsyncCheckpointIO must snapshot the checkpoint (clone tensors) on the main thread "
         "to avoid races with parameter mutation; got mutated value instead"
     )
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def test_async_checkpoint_clones_tensors_to_cpu():
+    """Verify that _clone_tensor moves tensors to CPU to avoid doubling GPU memory usage."""
+    from lightning.pytorch.plugins.io.async_plugin import _clone_tensor
+
+    t = torch.tensor([1.0, 2.0, 3.0])
+    cloned = _clone_tensor(t)
+
+    # cloned tensor should be on CPU
+    assert cloned.device == torch.device("cpu"), f"Expected CPU tensor, got {cloned.device}"
+    # values should match
+    assert torch.equal(cloned, t)
+    # cloned tensor should not share storage with the original
+    assert cloned.data_ptr() != t.data_ptr()
