3232from verl .utils .fsdp_utils import FSDPModule , fsdp2_clip_grad_norm_
3333from verl .utils .profiler import GPUMemoryLogger
3434from verl .utils .py_functional import append_to_dict
35- from verl .utils .seqlen_balancing import prepare_dynamic_batch
35+ # ajet/backbone/verl/seqlen_balancing.py
36+ from ajet .backbone .verl .seqlen_balancing import prepare_dynamic_batch , restore_dynamic_batch
3637from verl .workers .actor .dp_actor import DataParallelPPOActor
3738
3839__all__ = ["AjetDataParallelPPOActor" ]
@@ -46,8 +47,94 @@ class AjetDataParallelPPOActor(DataParallelPPOActor):
4647
4748 1. Supports `override_ppo_mini_batch_num` to control the number of optimizer steps per train-batch-step.
4849 2. Adds debug print for tensor shapes during training.
50+ 3. Overrides `compute_log_prob` to use ajet's `prepare_dynamic_batch`/`restore_dynamic_batch` helpers.
4951 """
5052
53+ @GPUMemoryLogger (role = "dp actor" , logger = logger )
54+ def compute_log_prob (self , data : DataProto , calculate_entropy : bool = False ) -> dict [str , torch .Tensor ]:
55+ """Compute the log probability of the responses given input_ids, attention_mask and position_ids
56+
57+ Args:
58+ data (DataProto): a DataProto containing keys
59+
60+ ``input_ids``: tensor of shape [batch_size, sequence_length]. torch.int64. Note that input_ids is the
61+ concatenation of prompt and response. Note that ``sequence_length = prompt_length + response_length``.
62+
63+ ``attention_mask``: tensor of shape [batch_size, sequence_length]. torch.int64.
64+
65+ ``position_ids``: tensor of shape [batch_size, sequence_length]. torch.int64.
66+
67+ ``responses``: tensor of shape [batch_size, response_length]. torch.int64.
68+
69+ Returns:
70+ dict[str, torch.Tensor]: a dict containing keys
71+ - ``log_probs``: tensor of shape [batch_size, response_length]. torch.float32.
72+ - ``entropys``: tensor of shape [batch_size, response_length]. torch.float32.
73+ - ``sum_pi_squared``: tensor of shape [batch_size, response_length]. torch.float32.
74+ """
75+ calculate_sum_pi_squared = self .config .get ("calculate_sum_pi_squared" , False )
76+ self .actor_module .eval ()
77+
78+ micro_batch_size = data .meta_info ["micro_batch_size" ]
79+ temperature = data .meta_info ["temperature" ] # temperature must be in the data.meta_info to avoid silent error
80+ use_dynamic_bsz = data .meta_info ["use_dynamic_bsz" ]
81+ pad_token_id = data .meta_info .get ("pad_token_id" , 0 )
82+ has_multi_modal_inputs = "multi_modal_inputs" in data .non_tensor_batch .keys ()
83+
84+ select_keys = ["responses" , "input_ids" , "attention_mask" , "position_ids" ]
85+ non_tensor_select_keys = ["multi_modal_inputs" ] if has_multi_modal_inputs else []
86+ if self .use_prefix_grouper :
87+ select_keys += [k for k in ["prompts" , "response_mask" ] if k in data .batch ]
88+ if "uid" in data .non_tensor_batch :
89+ non_tensor_select_keys .append ("uid" )
90+
91+ data = data .select (batch_keys = select_keys , non_tensor_batch_keys = non_tensor_select_keys )
92+
93+ if use_dynamic_bsz :
94+ max_token_len = data .meta_info ["max_token_len" ] * self .ulysses_sequence_parallel_size
95+ micro_batches , batch_idx_list = prepare_dynamic_batch (data , max_token_len = max_token_len )
96+ else :
97+ micro_batches = data .split (micro_batch_size )
98+
99+ log_probs_lst = []
100+ entropy_lst = []
101+ sum_pi_squared_lst = []
102+ print (f"len(micro_batches) = { len (micro_batches )} " )
103+ for micro_batch in micro_batches :
104+ micro_batch = micro_batch .to (get_device_id ())
105+ model_inputs = {** micro_batch .batch , ** micro_batch .non_tensor_batch , "pad_token_id" : pad_token_id }
106+ with torch .no_grad ():
107+ outputs = self ._forward_micro_batch (
108+ model_inputs , temperature = temperature , calculate_entropy = calculate_entropy
109+ )
110+ log_probs_lst .append (outputs ["log_probs" ])
111+ if calculate_entropy :
112+ entropy_lst .append (outputs ["entropys" ])
113+ if calculate_sum_pi_squared :
114+ sum_pi_squared_lst .append (outputs ["sum_pi_squared" ])
115+
116+ log_probs = torch .concat (log_probs_lst , dim = 0 )
117+ if calculate_entropy :
118+ entropys = torch .concat (entropy_lst , dim = 0 )
119+ if calculate_sum_pi_squared :
120+ sum_pi_squared = torch .concat (sum_pi_squared_lst , dim = 0 )
121+
122+ if use_dynamic_bsz :
123+ log_probs = restore_dynamic_batch (log_probs , batch_idx_list )
124+ if calculate_entropy :
125+ entropys = restore_dynamic_batch (entropys , batch_idx_list )
126+ if calculate_sum_pi_squared :
127+ sum_pi_squared = restore_dynamic_batch (sum_pi_squared , batch_idx_list )
128+
129+ outputs = {"log_probs" : log_probs }
130+ if calculate_entropy :
131+ outputs ["entropys" ] = entropys
132+ if calculate_sum_pi_squared :
133+ outputs ["sum_pi_squared" ] = sum_pi_squared
134+ return outputs
135+
136+
137+
51138 @GPUMemoryLogger (role = "dp actor" , logger = logger )
52139 def update_policy (self , data : DataProto ):
53140 # make sure we are in training mode
0 commit comments