Update Blackwell tutorial to be compatible with the 4.5-dev version (#3130)

LongshengDu · web-flow · commit 08185b9c3e90 · 2026-04-09T14:40:33.000+08:00
* Update Blackwell tutorial to be compatible with the 4.5-dev version

* update example for reverted changes

* add more example fixes
diff --git a/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent_prefetch.py b/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent_prefetch.py
@@ -647,7 +647,7 @@ class SharedStorage:
             ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage]
             acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage]
             acc_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage]
-            tmem_dealloc_mbar_ptr: cutlass.Int64
+            tmem_dealloc_mbar: cutlass.Int64
             tmem_holding_buf: cutlass.Int32
             # (EPI_TILE_M, EPI_TILE_N, STAGE)
             sC: cute.struct.Align[
@@ -826,11 +826,11 @@ def kernel(
 
         # Tensor memory dealloc barrier init
         tmem = utils.TmemAllocator(
-            storage.tmem_holding_buf,
+            storage.tmem_holding_buf.ptr,
             barrier_for_retrieve=self.tmem_alloc_barrier,
             allocator_warp_id=self.epilog_warp_id[0],
             is_two_cta=use_2cta_instrs,
-            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar_ptr,
+            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
         )
 
         # Cluster arrive after barrier init
diff --git a/examples/python/CuTeDSL/blackwell/dense_gemm_persistent_prefetch.py b/examples/python/CuTeDSL/blackwell/dense_gemm_persistent_prefetch.py
@@ -648,7 +648,7 @@ class SharedStorage:
             acc_full_mbar_ptr: cute.struct.MemRange[
                 cutlass.Int64, self.num_acc_stage * 2
             ]
-            tmem_dealloc_mbar_ptr: cutlass.Int64
+            tmem_dealloc_mbar: cutlass.Int64
             tmem_holding_buf: cutlass.Int32
 
         smem = utils.SmemAllocator()
@@ -699,11 +699,11 @@ class SharedStorage:
             )
         # Tensor memory dealloc barrier init
         tmem = utils.TmemAllocator(
-            storage.tmem_holding_buf,
+            storage.tmem_holding_buf.ptr,
             barrier_for_retrieve=tmem_alloc_barrier,
             allocator_warp_id=self.epilog_warp_id[0],
             is_two_cta=use_2cta_instrs,
-            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar_ptr,
+            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
         )
 
         # Cluster arrive after barrier init
diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_2.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_2.py
@@ -219,11 +219,11 @@ def kernel(
         * len((mma_warp_id, *epilogue_warp_ids)),  # 5 warps = 160 threads
     )
     tmem = utils.TmemAllocator(
-        storage.tmem_holding_buffer,
+        storage.tmem_holding_buffer.ptr,
         barrier_for_retrieve=tmem_alloc_barrier,
         allocator_warp_id=epilogue_warp_ids[0],
         is_two_cta=True if use_2cta_instrs else False,
-        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,
+        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
     )
 
     # Partition tensors for TMA; This requires the tensors partitioned for MMA
diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3.py
@@ -152,11 +152,11 @@ def kernel(
         * len((mma_warp_id, *epilogue_warp_ids)),  # 5 warps = 160 threads
     )
     tmem = utils.TmemAllocator(
-        storage.tmem_holding_buffer,
+        storage.tmem_holding_buffer.ptr,
         barrier_for_retrieve=tmem_alloc_barrier,
         allocator_warp_id=epilogue_warp_ids[0],
         is_two_cta=True,
-        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,
+        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
     )
 
     num_tma_copy_bytes = (
diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3_1.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3_1.py
@@ -159,11 +159,11 @@ def kernel(
         * len((mma_warp_id, *epilogue_warp_ids)),  # 5 warps = 160 threads
     )
     tmem = utils.TmemAllocator(
-        storage.tmem_holding_buffer,
+        storage.tmem_holding_buffer.ptr,
         barrier_for_retrieve=tmem_alloc_barrier,
         allocator_warp_id=epilogue_warp_ids[0],
         is_two_cta=True,
-        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,
+        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
     )
 
     num_tma_copy_bytes = (
diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_4.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_4.py
@@ -184,11 +184,11 @@ def cluster_specific_kernel(
         * len((mma_warp_id, *epilogue_warp_ids)),  # 5 warps = 160 threads
     )
     tmem = utils.TmemAllocator(
-        storage.tmem_holding_buffer,
+        storage.tmem_holding_buffer.ptr,
         barrier_for_retrieve=tmem_alloc_barrier,
         allocator_warp_id=epilogue_warp_ids[0],
         is_two_cta=True,
-        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,
+        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
     )
 
     num_tma_copy_bytes = (
diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_5.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_5.py
@@ -171,11 +171,11 @@ def kernel(
         * len((mma_warp_id, *epilogue_warp_ids)),  # 5 warps = 160 threads
     )
     tmem = utils.TmemAllocator(
-        storage.tmem_holding_buffer,
+        storage.tmem_holding_buffer.ptr,
         barrier_for_retrieve=tmem_alloc_barrier,
         allocator_warp_id=epilogue_warp_ids[0],
         is_two_cta=True,
-        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,
+        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
     )
 
     num_tma_copy_bytes = (
diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_6.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_6.py
@@ -214,11 +214,11 @@ def gemm(
         * len((mma_warp_id, *epilogue_warp_ids)),  # 5 warps = 160 threads
     )
     tmem = utils.TmemAllocator(
-        storage.tmem_holding_buffer,
+        storage.tmem_holding_buffer.ptr,
         barrier_for_retrieve=tmem_alloc_barrier,
         allocator_warp_id=epilogue_warp_ids[0],
         is_two_cta=True,
-        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,
+        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
     )
 
     num_tma_copy_bytes = (
diff --git a/examples/python/CuTeDSL/distributed/distributed_all_gather_gemm_blackwell.py b/examples/python/CuTeDSL/distributed/distributed_all_gather_gemm_blackwell.py
@@ -756,7 +756,7 @@ class SharedStorage:
             acc_full_mbar_ptr: cute.struct.MemRange[
                 cutlass.Int64, self.num_acc_stage * 2
             ]
-            tmem_dealloc_mbar_ptr: cutlass.Int64
+            tmem_dealloc_mbar: cutlass.Int64
             tmem_holding_buf: cutlass.Int32
 
         smem = utils.SmemAllocator()
@@ -806,11 +806,11 @@ class SharedStorage:
             )
         # Tensor memory dealloc barrier init
         tmem = utils.TmemAllocator(
-            storage.tmem_holding_buf,
+            storage.tmem_holding_buf.ptr,
             barrier_for_retrieve=tmem_alloc_barrier,
             allocator_warp_id=self.epilog_warp_id[0],
             is_two_cta=use_2cta_instrs,
-            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar_ptr,
+            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
         )
 
         # Cluster arrive after barrier init
diff --git a/examples/python/CuTeDSL/distributed/distributed_gemm_all_reduce_blackwell.py b/examples/python/CuTeDSL/distributed/distributed_gemm_all_reduce_blackwell.py
@@ -672,7 +672,7 @@ class SharedStorage:
             acc_full_mbar_ptr: cute.struct.MemRange[
                 cutlass.Int64, self.num_acc_stage * 2
             ]
-            tmem_dealloc_mbar_ptr: cutlass.Int64
+            tmem_dealloc_mbar: cutlass.Int64
             tmem_holding_buf: cutlass.Int32
 
         smem = utils.SmemAllocator()
@@ -723,11 +723,11 @@ class SharedStorage:
             )
         # Tensor memory dealloc barrier init
         tmem = utils.TmemAllocator(
-            storage.tmem_holding_buf,
+            storage.tmem_holding_buf.ptr,
             barrier_for_retrieve=tmem_alloc_barrier,
             allocator_warp_id=self.epilogue_warp_id[0],
             is_two_cta=use_2cta_instrs,
-            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar_ptr,
+            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
         )
 
         # Cluster arrive after barrier init
diff --git a/examples/python/CuTeDSL/distributed/distributed_gemm_reduce_scatter_blackwell.py b/examples/python/CuTeDSL/distributed/distributed_gemm_reduce_scatter_blackwell.py
@@ -541,7 +541,7 @@ class SharedStorage:
             ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage]
             acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage]
             acc_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage]
-            tmem_dealloc_mbar_ptr: cutlass.Int64
+            tmem_dealloc_mbar: cutlass.Int64
             tmem_holding_buf: cutlass.Int32
             # (EPI_TILE_M, EPI_TILE_N, STAGE)
             sC: cute.struct.Align[
@@ -660,8 +660,8 @@ def kernel(
         smem = utils.SmemAllocator()
         storage = smem.allocate(self.shared_storage)
 
-        tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr
-        tmem_holding_buf = storage.tmem_holding_buf
+        tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar.ptr
+        tmem_holding_buf = storage.tmem_holding_buf.ptr
 
         # Initialize mainloop ab_pipeline (barrier) and states
         ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
diff --git a/examples/python/CuTeDSL/notebooks/tour_to_sol_gemm.ipynb b/examples/python/CuTeDSL/notebooks/tour_to_sol_gemm.ipynb
@@ -369,7 +369,7 @@
     "        num_threads=threads_per_cta,\n",
     "    )\n",
     "    tmem = utils.TmemAllocator(\n",
-    "        storage.tmem_holding_buf,\n",
+    "        storage.tmem_holding_buf.ptr,\n",
     "        barrier_for_retrieve=tmem_alloc_barrier,\n",
     "    )\n",
     "    num_tmem_cols = 512\n",
@@ -742,7 +742,7 @@
     "        num_threads=threads_per_cta,\n",
     "    )\n",
     "    tmem = utils.TmemAllocator(\n",
-    "        storage.tmem_holding_buf,\n",
+    "        storage.tmem_holding_buf.ptr,\n",
     "        barrier_for_retrieve=tmem_alloc_barrier,\n",
     "    )\n",
     "    num_tmem_cols = 512\n",

Original file line number	Diff line number	Diff line change
`@@ -219,11 +219,11 @@ def kernel(`
`219`	`219`	`* len((mma_warp_id, *epilogue_warp_ids)), # 5 warps = 160 threads`
`220`	`220`	`)`
`221`	`221`	`tmem = utils.TmemAllocator(`
`222`		`- storage.tmem_holding_buffer,`
	`222`	`+ storage.tmem_holding_buffer.ptr,`
`223`	`223`	`barrier_for_retrieve=tmem_alloc_barrier,`
`224`	`224`	`allocator_warp_id=epilogue_warp_ids[0],`
`225`	`225`	`is_two_cta=True if use_2cta_instrs else False,`
`226`		`- two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,`
	`226`	`+ two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,`
`227`	`227`	`)`
`228`	`228`
`229`	`229`	`# Partition tensors for TMA; This requires the tensors partitioned for MMA`
Original file line number	Diff line number	Diff line change
`@@ -152,11 +152,11 @@ def kernel(`
`152`	`152`	`* len((mma_warp_id, *epilogue_warp_ids)), # 5 warps = 160 threads`
`153`	`153`	`)`
`154`	`154`	`tmem = utils.TmemAllocator(`
`155`		`- storage.tmem_holding_buffer,`
	`155`	`+ storage.tmem_holding_buffer.ptr,`
`156`	`156`	`barrier_for_retrieve=tmem_alloc_barrier,`
`157`	`157`	`allocator_warp_id=epilogue_warp_ids[0],`
`158`	`158`	`is_two_cta=True,`
`159`		`- two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,`
	`159`	`+ two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,`
`160`	`160`	`)`
`161`	`161`
`162`	`162`	`num_tma_copy_bytes = (`
Original file line number	Diff line number	Diff line change
`@@ -159,11 +159,11 @@ def kernel(`
`159`	`159`	`* len((mma_warp_id, *epilogue_warp_ids)), # 5 warps = 160 threads`
`160`	`160`	`)`
`161`	`161`	`tmem = utils.TmemAllocator(`
`162`		`- storage.tmem_holding_buffer,`
	`162`	`+ storage.tmem_holding_buffer.ptr,`
`163`	`163`	`barrier_for_retrieve=tmem_alloc_barrier,`
`164`	`164`	`allocator_warp_id=epilogue_warp_ids[0],`
`165`	`165`	`is_two_cta=True,`
`166`		`- two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,`
	`166`	`+ two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,`
`167`	`167`	`)`
`168`	`168`
`169`	`169`	`num_tma_copy_bytes = (`
Original file line number	Diff line number	Diff line change
`@@ -184,11 +184,11 @@ def cluster_specific_kernel(`
`184`	`184`	`* len((mma_warp_id, *epilogue_warp_ids)), # 5 warps = 160 threads`
`185`	`185`	`)`
`186`	`186`	`tmem = utils.TmemAllocator(`
`187`		`- storage.tmem_holding_buffer,`
	`187`	`+ storage.tmem_holding_buffer.ptr,`
`188`	`188`	`barrier_for_retrieve=tmem_alloc_barrier,`
`189`	`189`	`allocator_warp_id=epilogue_warp_ids[0],`
`190`	`190`	`is_two_cta=True,`
`191`		`- two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,`
	`191`	`+ two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,`
`192`	`192`	`)`
`193`	`193`
`194`	`194`	`num_tma_copy_bytes = (`
Original file line number	Diff line number	Diff line change
`@@ -171,11 +171,11 @@ def kernel(`
`171`	`171`	`* len((mma_warp_id, *epilogue_warp_ids)), # 5 warps = 160 threads`
`172`	`172`	`)`
`173`	`173`	`tmem = utils.TmemAllocator(`
`174`		`- storage.tmem_holding_buffer,`
	`174`	`+ storage.tmem_holding_buffer.ptr,`
`175`	`175`	`barrier_for_retrieve=tmem_alloc_barrier,`
`176`	`176`	`allocator_warp_id=epilogue_warp_ids[0],`
`177`	`177`	`is_two_cta=True,`
`178`		`- two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,`
	`178`	`+ two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,`
`179`	`179`	`)`
`180`	`180`
`181`	`181`	`num_tma_copy_bytes = (`
Original file line number	Diff line number	Diff line change
`@@ -214,11 +214,11 @@ def gemm(`
`214`	`214`	`* len((mma_warp_id, *epilogue_warp_ids)), # 5 warps = 160 threads`
`215`	`215`	`)`
`216`	`216`	`tmem = utils.TmemAllocator(`
`217`		`- storage.tmem_holding_buffer,`
	`217`	`+ storage.tmem_holding_buffer.ptr,`
`218`	`218`	`barrier_for_retrieve=tmem_alloc_barrier,`
`219`	`219`	`allocator_warp_id=epilogue_warp_ids[0],`
`220`	`220`	`is_two_cta=True,`
`221`		`- two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,`
	`221`	`+ two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,`
`222`	`222`	`)`
`223`	`223`
`224`	`224`	`num_tma_copy_bytes = (`