Skip to content

Commit b55dbc0

Browse files
perf: skip redundant clone() for CUDA tensors in async checkpoint
For CUDA tensors, cpu() already allocates a new host-memory copy, so an additional clone() is unnecessary and wastes memory bandwidth. For CPU tensors cpu() is a no-op, so clone() remains necessary. Co-authored-by: TheGreatFrankie
1 parent 865d600 commit b55dbc0

File tree

2 files changed

+16
-6
lines changed

2 files changed

+16
-6
lines changed

src/lightning/pytorch/plugins/io/async_plugin.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,15 @@ def teardown(self) -> None:
9595

9696
# snapshot the checkpoint payload on the caller thread to avoid races with parameter mutation
9797
def _clone_tensor(t: torch.Tensor) -> torch.Tensor:
98-
"""Clones a tensor to CPU on the caller thread."""
99-
# detach to avoid autograd history, move to CPU to avoid doubling GPU memory usage, and clone to take a
100-
# point-in-time copy. Moving to CPU first is important because clone() on a CUDA tensor allocates new GPU memory,
101-
# which can cause OOM errors for large model checkpoints.
102-
return t.detach().cpu().clone()
98+
"""Clone a tensor to CPU memory.
99+
100+
Detaches from autograd, moves to CPU, and ensures a point-in-time snapshot
101+
that won't be mutated by ongoing training.
102+
103+
For CUDA tensors ``cpu()`` already allocates a new host-memory copy, so an
104+
extra ``clone()`` is unnecessary. For CPU tensors ``cpu()`` is a no-op, so
105+
``clone()`` is required to break storage sharing.
106+
"""
107+
if t.is_cuda:
108+
return t.detach().cpu()
109+
return t.detach().clone()

tests/tests_pytorch/plugins/test_async_checkpoint.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def test_async_checkpoint_should_snapshot_values_before_mutation():
5555

5656
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
5757
def test_async_checkpoint_clones_tensors_to_cpu():
58-
"""Verify that _clone_tensor moves tensors to CPU to avoid doubling GPU memory usage."""
58+
"""Verify that _clone_tensor produces a CPU snapshot that does not share storage."""
5959
from lightning.pytorch.plugins.io.async_plugin import _clone_tensor
6060

6161
t = torch.tensor([1.0, 2.0, 3.0])
@@ -67,3 +67,6 @@ def test_async_checkpoint_clones_tensors_to_cpu():
6767
assert torch.equal(cloned, t)
6868
# cloned tensor should not share storage with the original
6969
assert cloned.data_ptr() != t.data_ptr()
70+
# mutation of the original must not affect the clone
71+
t.add_(1.0)
72+
assert torch.equal(cloned, torch.tensor([1.0, 2.0, 3.0]))

0 commit comments

Comments (0)