Skip to content

Commit 8f904b3

Browse files
WeiHaochengclaude
andcommitted
feat: add scaffolding rollout workflow
Key design: #818 Co-Authored-By: narutolhy Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f3d7e50 commit 8f904b3

31 files changed

Lines changed: 8285 additions & 20 deletions

areal/experimental/openai/cache.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,22 @@ def __init__(self, *args, **kwargs):
1717
self._total_reward = 0.0
1818
self._lock = threading.Lock()
1919

20+
def __deepcopy__(self, memo):
    """Deep-copy support for an *empty* cache.

    ``threading.Lock`` cannot be deep-copied. Controllers that hold
    an ``InteractionCache`` (e.g. ``ChatTracer``) are cloned via
    ``Controller.clone()`` (``copy.deepcopy``). The cache must be
    empty at clone time; a non-empty cache indicates a bug in the
    caller.

    Args:
        memo: Standard ``copy.deepcopy`` memo dict; the fresh cache is
            registered under ``id(self)`` so repeated references to
            this cache resolve to the same copy.

    Returns:
        A new, empty cache of the same (sub)class as ``self``.

    Raises:
        RuntimeError: If the cache is non-empty at copy time.
    """
    # Raise explicitly instead of using ``assert`` so the invariant
    # survives ``python -O``; silently cloning a non-empty cache would
    # drop its items.
    if len(self) != 0:
        raise RuntimeError(
            f"InteractionCache must be empty when deep-copied, but has {len(self)} items"
        )
    # ``type(self)`` keeps subclasses intact instead of pinning the
    # result to the base class.
    new = type(self)()
    memo[id(self)] = new
    return new
35+
2036
@property
2137
def last_interaction_id(self) -> str:
2238
return next(reversed(self))

areal/reward/__init__.py

Lines changed: 60 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
# SPDX-License-Identifier: Apache-2.0
22

3-
from math_verify.metric import math_metric
4-
from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig
3+
import concurrent.futures
4+
5+
from math_verify.grader import verify as math_verify_verify
6+
from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig, parse
57

68
from areal.utils import logging
79

@@ -29,38 +31,76 @@ def get_custom_reward_fn(path: str, **kwargs):
2931
class MathVerifyWorker:
3032
"""Thin wrapper over math_verify with configurable extraction/precision.
3133
34+
Uses ``parse()`` + ``verify()`` directly instead of ``math_metric()``
35+
so that signal-based timeouts can be disabled (``parsing_timeout=None``,
36+
``timeout_seconds=None``). This avoids ``signal.alarm()`` which only
37+
works in the main thread. A thread-safe timeout is enforced via
38+
``concurrent.futures`` instead.
39+
3240
Args:
3341
try_extract_without_anchor: When False, only answers with explicit anchors
3442
(e.g., "answer = 1", "final answer = 1") are matched. When True,
3543
any numeric string in the text may be extracted.
3644
precision: Number of significant digits that must match.
45+
timeout: Thread-safe timeout in seconds for the entire verify call
46+
(parsing + comparison). ``None`` disables the timeout.
3747
3848
Notes:
3949
Tune these knobs based on dataset format and model output style.
4050
"""
4151

42-
def __init__(self, try_extract_without_anchor=True, precision: int = 6):
43-
self.verify_func = math_metric(
44-
gold_extraction_target=(
45-
ExprExtractionConfig(
46-
try_extract_without_anchor=try_extract_without_anchor
47-
),
48-
LatexExtractionConfig(),
49-
),
50-
pred_extraction_target=(
51-
ExprExtractionConfig(
52-
try_extract_without_anchor=try_extract_without_anchor
53-
),
54-
LatexExtractionConfig(),
55-
),
56-
precision=precision,
52+
def __init__(
    self,
    try_extract_without_anchor=True,
    precision: int = 6,
    timeout: float | None = 5.0,
):
    """Configure extraction targets, precision, and the verify timeout.

    Args:
        try_extract_without_anchor: Passed through to
            ``ExprExtractionConfig``; when False only anchored answers
            are matched.
        precision: Number of significant digits that must match.
        timeout: Thread-safe timeout in seconds for the whole verify
            call; ``None`` disables it.
    """

    def _make_targets():
        # Build a fresh (Expr, Latex) extraction pair per call so gold
        # and pred each get their own config instances.
        return (
            ExprExtractionConfig(
                try_extract_without_anchor=try_extract_without_anchor
            ),
            LatexExtractionConfig(),
        )

    self.gold_extraction_target = _make_targets()
    self.pred_extraction_target = _make_targets()
    self.precision = precision
    self.timeout = timeout
68+
69+
def _verify_impl(self, response: str, ground_truth: str) -> float:
    """Parse both strings and compare them; no timeout wrapper here.

    Signal-based timeouts are disabled (``parsing_timeout=None`` /
    ``timeout_seconds=None``) so this is safe to run off the main
    thread; the caller enforces a timeout externally.
    """
    parsed_gold = parse(
        ground_truth,
        extraction_config=self.gold_extraction_target,
        parsing_timeout=None,
    )
    parsed_pred = parse(
        response,
        extraction_config=self.pred_extraction_target,
        parsing_timeout=None,
    )
    # Either side failing to parse means no credit.
    if not parsed_gold or not parsed_pred:
        return 0.0
    matched = math_verify_verify(
        parsed_gold,
        parsed_pred,
        float_rounding=self.precision,
        timeout_seconds=None,
    )
    return float(bool(matched))
5890

5991
def verify(self, response: str, ground_truth: str) -> float:
60-
# ground_truth_parsable = "\\boxed{" + ground_truth + "}"
6192
try:
62-
ret_score, _ = self.verify_func([ground_truth], [response])
63-
return float(ret_score)
93+
if self.timeout is None:
94+
return self._verify_impl(response, ground_truth)
95+
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
96+
future = executor.submit(self._verify_impl, response, ground_truth)
97+
return future.result(timeout=self.timeout)
98+
except concurrent.futures.TimeoutError:
99+
logger.warning(
100+
f"Timeout ({self.timeout}s) in MathVerifyWorker.verify for "
101+
f"response={response!r} and ground_truth={ground_truth!r}",
102+
)
103+
return 0.0
64104
except Exception:
65105
logger.warning(
66106
f"Exception in MathVerifyWorker.verify for response={response} and ground_truth={ground_truth}",

examples/scaffolding/README.md

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
# Scaffolding Framework Examples for AReaL
2+
3+
This directory contains examples demonstrating how to use the Scaffolding framework with
4+
AReaL for reinforcement learning training.
5+
6+
## Overview
7+
8+
The scaffolding framework provides a modular, extensible way to combine
inference-time compute methods with RL training. It decouples the inference logic
(Controllers) from the execution backend (Workers), so different rollout, reward, and
trajectory-tracing methods can be composed freely.
12+
13+
### Key Components
14+
15+
1. **Controller**: Defines the inference-time compute logic (e.g., generation, reward
16+
computation)
17+
1. **Worker**: Handles the actual execution of tasks (e.g., TRT-LLM, OpenAI API)
18+
1. **ScaffoldingLlm**: Orchestrates controllers and workers together
19+
1. **ScaffoldingWorkflow**: Wraps ScaffoldingLlm as a RolloutWorkflow for AReaL training
20+
21+
### AReaL-Specific Components
22+
23+
The following components are implemented in `examples/scaffolding/`:
24+
25+
- **`CreateWorkerFromEngine`**: Creates a scaffolding Worker from AReaL's
26+
InferenceEngine (e.g., RemoteSGLangEngine). The returned Worker is similar to
27+
scaffolding's `OpenaiWorker` but integrated with AReaL's engine.
28+
29+
- **`RLVRRewardController`**: A Controller that computes rewards for generated samples
30+
using verifiable reward functions (e.g., math answer verification).
31+
32+
- **`PipelineTrajectoryMaker`**: A Controller that composes generation and reward
33+
controllers into a pipeline that produces training trajectories.
34+
35+
- **`ScaffoldingWorkflow`**: A `RolloutWorkflow` implementation that wraps
36+
ScaffoldingLlm for integration with AReaL's training pipeline.
37+
38+
## RLVR Example with GSM8K
39+
40+
### Quick Start
41+
42+
```bash
43+
python examples/scaffolding/gsm8k_rlvr_scaffolding.py \
44+
--config examples/scaffolding/gsm8k_rlvr_scaffolding.yaml
45+
```
46+
47+
### Architecture
48+
49+
The scaffolding workflow follows this pattern from the RFC:
50+
51+
```python
52+
# Step 1: Create Worker from the SGLang engine
53+
rollout_worker = CreateWorkerFromEngine(engine)
54+
55+
# Step 2: Create controllers
56+
rollout_controller = NativeGenerationController()
57+
reward_controller = RLVRRewardController(gsm8k_reward_fn)
58+
59+
# Step 3: Create trajectory maker (composes the controllers)
60+
trajectory_maker = PipelineTrajectoryMaker(rollout_controller, reward_controller)
61+
62+
# Step 4: Create ScaffoldingLlm (orchestrates controllers with workers)
63+
scaffolding_llm = ScaffoldingLlm(
64+
trajectory_maker,
65+
{NativeGenerationController.WorkerTag.GENERATION: rollout_worker},
66+
)
67+
68+
# Step 5: Create ScaffoldingWorkflow (wraps as RolloutWorkflow)
69+
scaffolding_workflow = ScaffoldingWorkflow(scaffolding_llm)
70+
```
71+
72+
### Data Flow Diagram
73+
74+
```
75+
┌─────────────────────────────────────────────────┐
76+
│ ScaffoldingWorkflow │
77+
│ │
78+
│ ┌───────────────────────────────────────────┐ │
79+
│ │ ScaffoldingLlm │ │
80+
│ │ │ │
81+
│ │ ┌─────────────────────────────────────┐ │ │
82+
│ │ │ PipelineTrajectoryMaker │ │ │
83+
│ │ │ │ │ │
84+
│ │ │ ┌───────────────────────────────┐ │ │ │
85+
Data ─────────────────────────┼──┼──┼──► NativeGenerationController │ │ │ │
86+
│ │ │ │ (from scaffolding.core) │ │ │ │
87+
│ │ │ └───────────────┬───────────────┘ │ │ │
88+
│ │ │ │ │ │ │
89+
│ │ │ ▼ │ │ │
90+
│ │ │ ┌───────────────────────────────┐ │ │ │
91+
│ │ │ │ RLVRRewardController │ │ │ │
92+
│ │ │ │ (from areal.experimental) │ │ │ │
93+
│ │ │ └───────────────┬───────────────┘ │ │ │
94+
│ │ │ │ │ │ │
95+
│ │ └──────────────────┼──────────────────┘ │ │
96+
│ │ │ │ │
97+
│ └─────────────────────┼─────────────────────┘ │
98+
│ │ │
99+
└────────────────────────┼────────────────────────┘
100+
101+
▼ Trajectories
102+
┌─────────────────────────────┐
103+
│ PPOTrainer │
104+
│ (GRPO/PPO Training) │
105+
└─────────────────────────────┘
106+
107+
via CreateWorkerFromEngine │
108+
109+
┌─────────────────────────────────────────┐
110+
│ RemoteSGLangEngine │
111+
│ (AReaL Inference Backend) │
112+
└─────────────────────────────────────────┘
113+
```
114+
115+
### How It Works
116+
117+
1. **Engine Initialization**: `RemoteSGLangEngine` is initialized with the rollout
118+
configuration and connected to the model server.
119+
120+
1. **Worker Creation**: `CreateWorkerFromEngine(engine)` wraps the engine into a
121+
scaffolding-compatible Worker. This allows scaffolding controllers to use AReaL's
122+
inference backends.
123+
124+
1. **Controller Pipeline**:
125+
126+
- `NativeGenerationController()`: Handles text generation by yielding
127+
`GenerationTask` objects to the Worker.
128+
- `RLVRRewardController(reward_fn)`: Computes rewards for generated samples using the
129+
provided reward function.
130+
- `PipelineTrajectoryMaker(gen_ctrl, reward_ctrl)`: Composes these controllers into a
131+
pipeline that produces training trajectories.
132+
133+
1. **ScaffoldingLlm**: Orchestrates the trajectory maker with the worker, handling the
134+
async execution of tasks.
135+
136+
1. **ScaffoldingWorkflow**: Wraps the ScaffoldingLlm as a `RolloutWorkflow` that can be
137+
used directly with AReaL's `PPOTrainer`.
138+
139+
1. **Training**: The trainer calls the workflow to generate trajectories, which are then
140+
used for GRPO/PPO training.
141+
142+
### Configuration
143+
144+
See `gsm8k_rlvr_scaffolding.yaml` for the full configuration. Key options:
145+
146+
```yaml
147+
# Model configuration
148+
pretrain_path: Qwen/Qwen2.5-3B-Instruct
149+
tokenizer_path: Qwen/Qwen2.5-3B-Instruct
150+
151+
# Generation hyperparameters
152+
gconfig:
153+
max_new_tokens: 1024
154+
temperature: 1.0
155+
top_p: 1.0
156+
n_samples: 8
157+
158+
# Inference engine configuration
159+
engine:
160+
type: sglang
161+
tp: 1
162+
max_model_len: 4096
163+
```
164+
165+
## Extending the Framework
166+
167+
### Custom Reward Controllers
168+
169+
You can create custom reward controllers by subclassing the base Controller:
170+
171+
```python
172+
from examples.scaffolding._compat import Controller
173+
174+
class CustomRewardController(Controller):
175+
def __init__(self, reward_fn):
176+
super().__init__()
177+
self.reward_fn = reward_fn
178+
179+
def process(self, tasks, **kwargs):
180+
# Compute rewards for completed generation tasks
181+
for task in tasks:
182+
reward = self.reward_fn(
183+
prompt=task.input_str,
184+
completion=task.output_str,
185+
**kwargs
186+
)
187+
task.customized_result_fields["reward"] = reward
188+
yield tasks
189+
```
190+
191+
### Custom Trajectory Makers
192+
193+
For different RL algorithms, you may need different trajectory formats:
194+
195+
```python
196+
from examples.scaffolding._compat import Controller
197+
import torch
198+
199+
class CustomTrajectoryMaker(Controller):
200+
def __init__(self, generation_controller, reward_controller):
201+
super().__init__()
202+
self.generation_controller = generation_controller
203+
self.reward_controller = reward_controller
204+
205+
def process(self, tasks, **kwargs):
206+
# Run generation
207+
yield from self.generation_controller.process(tasks, **kwargs)
208+
209+
# Run reward computation
210+
yield from self.reward_controller.process(tasks, **kwargs)
211+
212+
# Build trajectories
213+
trajectories = []
214+
for task in tasks:
215+
trajectory = {
216+
"input_ids": torch.tensor(task.output_tokens),
217+
"rewards": torch.tensor(task.customized_result_fields["reward"]),
218+
}
219+
trajectories.append(trajectory)
220+
yield trajectories
221+
```
222+
223+
## References
224+
225+
- [TensorRT-LLM Scaffolding README](https://github.com/NVIDIA/TensorRT-LLM/tree/main/tensorrt_llm/scaffolding)
226+
- [AReaL Workflow Documentation](../../docs/customization/workflow.md)
227+
- [RFC: Scaffolding Integration](https://github.com/inclusionAI/AReaL/issues/818)

0 commit comments

Comments
 (0)