Skip to content

Commit 942c287

Browse files
committed
Fix AttnRes block semantics and two-phase inference
1 parent 162c186 commit 942c287

12 files changed

Lines changed: 356 additions & 257 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ h = h_l + F(h_l) V = [b₀; b₁; …; bₙ] ← stack all blocks
3232
h = Σ αᵢ · Vᵢ ← weighted combination
3333
```
3434

35-
Each transformer layer has **two** AttnRes operations (before self-attention and before MLP), each with its own learned pseudo-query vector **w_l** initialized to zero. At initialization, all blocks receive equal weight (standard residual behavior). During training, the model learns to selectively route information from the most relevant depths.
35+
Each transformer layer has **two** AttnRes operations (before self-attention and before MLP), each with its own learned pseudo-query vector **w_l** initialized to zero. At initialization, all available sources receive equal weight. During training, the model learns to selectively route information from the most relevant depths.
3636

3737
## Quick Start
3838

examples/compare_residuals.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
//! Compare standard residual connections vs AttnRes.
22
//!
33
//! Shows that an AttnRes model with zero-initialized pseudo-queries
4-
//! starts as equivalent to uniform averaging over all prior blocks,
4+
//! starts as uniform averaging over all prior blocks,
55
//! and demonstrates the forward pass works correctly.
66
//!
77
//! Run with: `cargo run --example compare_residuals`
@@ -20,7 +20,7 @@ fn main() {
2020

2121
// Demo 1: Zero-init produces uniform weights (equivalent to mean)
2222
println!("1. Zero-initialized AttnRes = uniform averaging");
23-
println!(" (equivalent to standard residual connections)\n");
23+
println!(" (equal weights over all available sources)\n");
2424

2525
let config = AttnResConfig::new(32, 4, 2);
2626
let op: AttnResOp<B> = config.init_op(&device);

examples/demo_tui.rs

Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -204,10 +204,12 @@ fn train_step(
204204
fn compute_alpha(
205205
op: &AttnResOp<AB>,
206206
blocks: &[Tensor<AB, 3>],
207-
partial: &Tensor<AB, 3>,
207+
partial: Option<&Tensor<AB, 3>>,
208208
) -> Vec<f32> {
209209
let mut sources: Vec<Tensor<AB, 3>> = blocks.to_vec();
210-
sources.push(partial.clone());
210+
if let Some(partial) = partial {
211+
sources.push(partial.clone());
212+
}
211213
let n = sources.len();
212214

213215
let v = Tensor::stack(sources, 0); // [N+1, B, T, D]
@@ -268,26 +270,8 @@ fn extract_diagnostics(
268270
norms.push(norm_a);
269271
norms.push(norm_m);
270272

271-
// Replicate boundary handling from layer.forward() to get
272-
// the correct block state for alpha computation.
273-
let current_partial = state
274-
.partial_block
275-
.clone()
276-
.unwrap_or_else(|| Tensor::zeros_like(state.blocks.last().unwrap()));
277-
278-
let at_boundary = layer.is_at_boundary();
279-
let mut blocks_snap = state.blocks.clone();
280-
if at_boundary {
281-
blocks_snap.push(current_partial.clone());
282-
}
283-
let partial_snap = if at_boundary {
284-
Tensor::zeros_like(blocks_snap.last().unwrap())
285-
} else {
286-
current_partial
287-
};
288-
289273
// Compute actual attention weights for the attn sublayer
290-
let alpha = compute_alpha(attn_res, &blocks_snap, &partial_snap);
274+
let alpha = compute_alpha(attn_res, &state.blocks, state.partial_block.as_ref());
291275
depth_weights.push(alpha);
292276

293277
// Run the real forward to advance block state

examples/visualize_weights.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ fn main() {
141141
println!(" - This allows selective information routing across depth");
142142
println!();
143143
println!(" At initialization (zero pseudo-queries), all layers attend");
144-
println!(" uniformly, equivalent to standard residual connections.");
144+
println!(" uniformly across all available sources.");
145145
println!(" Training gradually differentiates the attention patterns.");
146146

147147
println!("\nDone!");

src/attn_res_op.rs

Lines changed: 56 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ impl AttnResConfig {
3434
///
3535
/// The pseudo-query is zero-initialized per the paper's requirement for
3636
/// training stability. This means the operation starts as uniform averaging
37-
/// over all sources (equivalent to standard residual connections).
37+
/// over all available sources.
3838
pub fn init_op<B: Backend>(&self, device: &B::Device) -> AttnResOp<B> {
3939
AttnResOp {
4040
// CRITICAL: zero initialization per paper requirement
@@ -47,47 +47,62 @@ impl AttnResConfig {
4747
}
4848

4949
impl<B: Backend> AttnResOp<B> {
50-
/// Compute attention residual over block representations.
51-
///
52-
/// # Arguments
53-
/// * `blocks` - Completed block representations [N tensors of shape [B, T, D]]
54-
/// * `partial_block` - Current intra-block partial sum [B, T, D]
50+
/// Compute attention residual over any available block representations.
5551
///
56-
/// # Returns
57-
/// * Attention-weighted combination of all sources [B, T, D]
58-
pub fn forward(&self, blocks: &[Tensor<B, 3>], partial_block: &Tensor<B, 3>) -> Tensor<B, 3> {
59-
// Step 1: Stack all sources into value matrix
60-
// V: [N+1, B, T, D]
52+
/// `partial_block` is optional because the first sublayer of the network
53+
/// and the first sublayer of each new block attend only over completed
54+
/// blocks (Eq. 6 in the paper) and therefore have no intra-block partial.
55+
pub fn forward_optional_partial(
56+
&self,
57+
blocks: &[Tensor<B, 3>],
58+
partial_block: Option<&Tensor<B, 3>>,
59+
) -> Tensor<B, 3> {
6160
let mut sources: Vec<Tensor<B, 3>> = blocks.to_vec();
62-
sources.push(partial_block.clone());
63-
let v = Tensor::stack(sources, 0); // [N+1, B, T, D]
61+
if let Some(partial_block) = partial_block {
62+
sources.push(partial_block.clone());
63+
}
64+
65+
assert!(
66+
!sources.is_empty(),
67+
"AttnResOp requires at least one source tensor"
68+
);
69+
70+
// Step 1: Stack all sources into value matrix
71+
// V: [N, B, T, D] or [N+1, B, T, D]
72+
let v = Tensor::stack(sources, 0);
6473

6574
// Step 2: Apply RMSNorm to get keys
66-
// K: [N+1, B, T, D]
75+
// K: same shape as V
6776
let k = self.norm.forward_4d(v.clone());
6877

6978
// Step 3: Compute attention logits
70-
// w: [D] -> [1, 1, 1, D] for broadcasting
71-
// logits = sum(K * w, dim=3) -> [N+1, B, T]
7279
let w = self
7380
.pseudo_query
7481
.val()
7582
.unsqueeze_dim::<2>(0)
7683
.unsqueeze_dim::<3>(0)
7784
.unsqueeze_dim::<4>(0); // [1, 1, 1, D]
78-
let logits = (k * w).sum_dim(3).squeeze_dim::<3>(3); // [N+1, B, T]
85+
let logits = (k * w).sum_dim(3).squeeze_dim::<3>(3);
7986

8087
// Step 4: Softmax over the depth dimension (dim=0)
81-
// CRITICAL: softmax over depth, NOT sequence
82-
let alpha = softmax(logits, 0); // [N+1, B, T]
88+
let alpha = softmax(logits, 0);
8389

8490
// Step 5: Weighted sum of values
85-
// alpha: [N+1, B, T] -> [N+1, B, T, 1]
86-
// v: [N+1, B, T, D]
87-
// result: sum over dim=0 -> [B, T, D]
88-
let alpha_expanded = alpha.unsqueeze_dim::<4>(3); // [N+1, B, T, 1]
89-
let weighted = v * alpha_expanded; // [N+1, B, T, D]
90-
weighted.sum_dim(0).squeeze_dim::<3>(0) // [B, T, D]
91+
let alpha_expanded = alpha.unsqueeze_dim::<4>(3);
92+
let weighted = v * alpha_expanded;
93+
weighted.sum_dim(0).squeeze_dim::<3>(0)
94+
}
95+
96+
/// Compute attention residual over block representations.
97+
///
98+
/// # Arguments
99+
/// * `blocks` - Completed block representations [N tensors of shape [B, T, D]]
100+
/// * `partial_block` - Current intra-block partial sum [B, T, D]
101+
///
102+
/// # Returns
103+
/// * Attention-weighted combination of all sources [B, T, D]
104+
pub fn forward(&self, blocks: &[Tensor<B, 3>], partial_block: &Tensor<B, 3>) -> Tensor<B, 3> {
105+
self.forward_optional_partial(blocks, Some(partial_block))
91106
}
92107
}
93108

@@ -160,4 +175,20 @@ mod tests {
160175
let diff: f32 = (output - expected).abs().max().into_scalar();
161176
assert!(diff < 1e-4, "Single block should produce mean, diff={diff}");
162177
}
178+
179+
#[test]
180+
fn test_blocks_only_returns_only_source() {
181+
let device = Default::default();
182+
let config = AttnResConfig::new(32, 4, 2);
183+
let op = config.init_op::<TestBackend>(&device);
184+
185+
let embedding = Tensor::random([1, 8, 32], Distribution::Normal(0.0, 1.0), &device);
186+
let output = op.forward_optional_partial(&[embedding.clone()], None);
187+
188+
let diff: f32 = (output - embedding).abs().max().into_scalar();
189+
assert!(
190+
diff < 1e-5,
191+
"A single completed block should be returned unchanged, diff={diff}"
192+
);
193+
}
163194
}

src/layer.rs

Lines changed: 48 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,26 @@ impl AttnResConfig {
6464
}
6565

6666
impl<B: Backend> AttnResLayer<B> {
67+
fn attn_sublayer_idx(&self) -> usize {
68+
self.layer_idx * 2
69+
}
70+
71+
fn mlp_sublayer_idx(&self) -> usize {
72+
self.attn_sublayer_idx() + 1
73+
}
74+
75+
fn starts_new_block_before_sublayer(&self, sublayer_idx: usize) -> bool {
76+
sublayer_idx > 0 && sublayer_idx.is_multiple_of(self.block_size)
77+
}
78+
79+
pub(crate) fn starts_new_block_before_attn(&self) -> bool {
80+
self.starts_new_block_before_sublayer(self.attn_sublayer_idx())
81+
}
82+
83+
pub(crate) fn starts_new_block_before_mlp(&self) -> bool {
84+
self.starts_new_block_before_sublayer(self.mlp_sublayer_idx())
85+
}
86+
6787
/// Get the layer index.
6888
pub fn layer_idx(&self) -> usize {
6989
self.layer_idx
@@ -74,10 +94,14 @@ impl<B: Backend> AttnResLayer<B> {
7494
self.block_size
7595
}
7696

77-
/// Check if this layer is at a block boundary.
97+
/// Check if this layer's attention sublayer starts a new block.
98+
///
99+
/// Block sizing is defined in sublayers, so the MLP sublayer can also
100+
/// start a new block when `block_size` is odd or when `block_size == 1`
101+
/// (Full AttnRes). This helper preserves the historical public API by
102+
/// reporting only the pre-attention boundary.
78103
pub fn is_at_boundary(&self) -> bool {
79-
let half_block = self.block_size / 2;
80-
self.layer_idx > 0 && (half_block == 0 || self.layer_idx.is_multiple_of(half_block))
104+
self.starts_new_block_before_attn()
81105
}
82106

83107
/// Get references to the AttnRes operations (attn_res, mlp_res).
@@ -116,33 +140,18 @@ impl<B: Backend> AttnResLayer<B> {
116140
/// # Returns
117141
/// * Updated block state
118142
pub fn forward(&self, mut state: BlockState<B>, mask: Option<&Tensor<B, 3>>) -> BlockState<B> {
119-
// Get the current partial block, or zeros if at the start of a new block
120-
let current_partial = state
121-
.partial_block
122-
.take()
123-
.unwrap_or_else(|| Tensor::zeros_like(state.blocks.last().unwrap()));
124-
125-
// === Check block boundary ===
126-
// Block boundary occurs every block_size/2 transformer layers (each layer = 2 sublayers).
127-
// For Full AttnRes (block_size=1), every layer after the first is a boundary.
128-
let half_block = self.block_size / 2;
129-
let at_boundary =
130-
self.layer_idx > 0 && (half_block == 0 || self.layer_idx.is_multiple_of(half_block));
131-
132-
if at_boundary {
133-
// Push the completed partial block as a new block
134-
state.blocks.push(current_partial.clone());
135-
}
136-
137-
// The partial block for AttnRes input: if we just pushed, start fresh; otherwise use current
138-
let partial_for_attn = if at_boundary {
139-
Tensor::zeros_like(state.blocks.last().unwrap())
140-
} else {
141-
current_partial
142-
};
143-
144143
// === AttnRes before self-attention ===
145-
let h = self.attn_res.forward(&state.blocks, &partial_for_attn);
144+
let current_partial = state.partial_block.take();
145+
let h = self
146+
.attn_res
147+
.forward_optional_partial(&state.blocks, current_partial.as_ref());
148+
149+
let mut partial_for_attn =
150+
current_partial.unwrap_or_else(|| Tensor::zeros_like(state.blocks.last().unwrap()));
151+
if self.starts_new_block_before_attn() {
152+
state.blocks.push(partial_for_attn.clone());
153+
partial_for_attn = Tensor::zeros_like(state.blocks.last().unwrap());
154+
}
146155

147156
// === Self-attention sublayer ===
148157
let normed = self.attn_norm.forward(h);
@@ -152,14 +161,22 @@ impl<B: Backend> AttnResLayer<B> {
152161
let partial_after_attn = partial_for_attn + attn_out;
153162

154163
// === AttnRes before MLP ===
155-
let h = self.mlp_res.forward(&state.blocks, &partial_after_attn);
164+
let h = self
165+
.mlp_res
166+
.forward_optional_partial(&state.blocks, Some(&partial_after_attn));
167+
168+
let mut partial_for_mlp = partial_after_attn;
169+
if self.starts_new_block_before_mlp() {
170+
state.blocks.push(partial_for_mlp.clone());
171+
partial_for_mlp = Tensor::zeros_like(state.blocks.last().unwrap());
172+
}
156173

157174
// === MLP sublayer ===
158175
let normed = self.mlp_norm.forward(h);
159176
let mlp_out = self.mlp.forward(normed);
160177

161178
// Update partial block with MLP output
162-
let partial_after_mlp = partial_after_attn + mlp_out;
179+
let partial_after_mlp = partial_for_mlp + mlp_out;
163180

164181
state.partial_block = Some(partial_after_mlp);
165182
state

0 commit comments

Comments
 (0)