[RLlib; Offline RL] Implement Offline Policy Evaluation (OPE) via Importance Sampling. by simonsays1980 · Pull Request #53702 · ray-project/ray · GitHub
[RLlib; Offline RL] Implement Offline Policy Evaluation (OPE) via Importance Sampling. #53702
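For context on the evaluation types this PR wires in ("is" and "pdis", next to the existing "eval_loss"), the standard importance-sampling estimators for offline policy evaluation are sketched below. These are the textbook formulas; the exact weighting, normalization, and discounting used by the implementation in this PR may differ.

\hat{V}_{\mathrm{IS}} = \frac{1}{n} \sum_{i=1}^{n} \Bigg( \prod_{t=0}^{T_i - 1} \frac{\pi_\theta(a_t^i \mid s_t^i)}{\pi_\beta(a_t^i \mid s_t^i)} \Bigg) \sum_{t=0}^{T_i - 1} \gamma^t r_t^i

\hat{V}_{\mathrm{PDIS}} = \frac{1}{n} \sum_{i=1}^{n} \sum_{t=0}^{T_i - 1} \gamma^t \Bigg( \prod_{k=0}^{t} \frac{\pi_\theta(a_k^i \mid s_k^i)}{\pi_\beta(a_k^i \mid s_k^i)} \Bigg) r_t^i

where \pi_\theta is the target policy being evaluated, \pi_\beta is the behavior policy that generated the n logged episodes, T_i is the length of episode i, and \gamma is the discount factor.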

Open · simonsays1980 wants to merge 18 commits into base: master from offline-policy-evaluation-importance-sampling
Commits (18)
ea74ebd
Implemented offline policy evaluation.
simonsays1980 Jun 10, 2025
30c9a8f
Small changes. WIP.
simonsays1980 Jun 10, 2025
41665ed
Merge branch 'master' into offline-policy-evaluation-importance-sampling
simonsays1980 Jun 10, 2025
2f43583
Enabled automatic runner class selection of evaluation loss or ope.
simonsays1980 Jun 10, 2025
768397e
Merge branch 'master' into offline-policy-evaluation-importance-sampling
simonsays1980 Jun 10, 2025
34f399c
Update rllib/offline/offline_evaluation_runner_group.py
simonsays1980 Jun 10, 2025
9fb729c
Added 'OfflinePolicyEvaluationRunner'.
simonsays1980 Jun 10, 2025
d7f8eee
Merge branch 'master' into offline-policy-evaluation-importance-sampling
simonsays1980 Jun 10, 2025
ee7f010
Removed unused imports.
simonsays1980 Jun 10, 2025
f2dd8cc
Added missing 'return'.
simonsays1980 Jun 10, 2025
b0290d2
Added new attributes to the test.
simonsays1980 Jun 11, 2025
0e327f3
Merge branch 'master' into offline-policy-evaluation-importance-sampling
simonsays1980 Jun 11, 2025
750c2ba
Merge branch 'master' into offline-policy-evaluation-importance-sampling
simonsays1980 Jun 12, 2025
7e43fee
Merge branch 'master' into offline-policy-evaluation-importance-sampling
simonsays1980 Jun 16, 2025
da66048
Fixed a bug in tests due to a global import.
simonsays1980 Jun 17, 2025
c54c296
Merge branch 'master' into offline-policy-evaluation-importance-sampling
simonsays1980 Jun 17, 2025
3ae36e3
Merge branch 'master' into offline-policy-evaluation-importance-sampling
simonsays1980 Jun 20, 2025
4d3168a
Merge branch 'master' into offline-policy-evaluation-importance-sampling
simonsays1980 Jun 23, 2025
45 changes: 41 additions & 4 deletions rllib/algorithms/algorithm.py
@@ -872,7 +872,7 @@ def setup(self, config: AlgorithmConfig) -> None:
self.offline_eval_runner_group: OfflineEvaluationRunnerGroup = OfflineEvaluationRunnerGroup(
config=self.evaluation_config,
# Do not create a local runner such that the dataset can be split.
local_runner=False,
local_runner=self.config.num_offline_eval_runners == 0,
# Provide the `RLModule`'s state for the `OfflinePreLearner`s.
module_state=rl_module_state[COMPONENT_RL_MODULE],
module_spec=module_spec,
@@ -1134,10 +1134,13 @@ def evaluate_offline(self):
)

# Evaluate with fixed duration.
self._evaluate_offline_with_fixed_duration()
if self.offline_eval_runner_group.num_healthy_remote_runners > 0:
self._evaluate_offline_with_fixed_duration()
else:
self._evaluate_offline_on_local_runner()
# Reduce the evaluation results.
eval_results = self.metrics.peek(
("EVALUATION_RESULTS", "OFFLINE_EVAL_RUNNER_RESULTS"), default={}
(EVALUATION_RESULTS, OFFLINE_EVAL_RUNNER_RESULTS), default={}
)

# Trigger `on_evaluate_offline_end` callback.
@@ -1153,7 +1156,7 @@ def evaluate_offline(self):
)

# Also return the results here for convenience.
return {EVALUATION_RESULTS: {OFFLINE_EVAL_RUNNER_RESULTS: eval_results}}
return {OFFLINE_EVAL_RUNNER_RESULTS: eval_results}

@PublicAPI
def evaluate(
@@ -1363,6 +1366,38 @@ def _evaluate_with_custom_eval_function(self) -> Tuple[ResultDict, int, int]:

return eval_results, env_steps, agent_steps

def _evaluate_offline_on_local_runner(self):
# if hasattr(env_runner, "input_reader") and env_runner.input_reader is None:
Contributor: remove this comment?

Collaborator (Author): Oh yeah! How did this even get in there?

# raise ValueError(
# "Can't evaluate on a local worker if this local worker does not have "
# "an environment!\nTry one of the following:"
# "\n1) Set `evaluation_interval` > 0 to force creating a separate "
# "evaluation EnvRunnerGroup.\n2) Set `create_local_env_runner=True` to "
# "force the local (non-eval) EnvRunner to have an environment to "
# "evaluate on."
# )
# elif self.config.evaluation_parallel_to_training:
# raise ValueError(
# "Cannot run on local evaluation worker parallel to training! Try "
# "setting `evaluation_parallel_to_training=False`."
# )

# How many batches do we need to run?
unit = "batches"
duration = (
self.config.offline_evaluation_duration
* self.config.dataset_num_iters_per_eval_runner
)

logger.info(f"Evaluating current state of {self} for {duration} {unit}.")

results = self.offline_eval_runner_group.local_runner.run()

self.metrics.aggregate(
[results],
key=(EVALUATION_RESULTS, OFFLINE_EVAL_RUNNER_RESULTS),
)

def _evaluate_on_local_env_runner(self, env_runner):
if hasattr(env_runner, "input_reader") and env_runner.input_reader is None:
raise ValueError(
@@ -1651,6 +1686,8 @@ def _offline_eval_runner_remote(runner, iter):
if iter != self.iteration:
continue
all_metrics.append(met)
# Note, the `dataset_num_iters_per_eval_runner` must be smaller than
# `offline_evaluation_duration` // `num_offline_eval_runners`.
num_units_done += (
met[ALL_MODULES][DATASET_NUM_ITERS_EVALUATED].peek()
if DATASET_NUM_ITERS_EVALUATED in met[ALL_MODULES]
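To make the budget note in `_offline_eval_runner_remote` above concrete, here is a purely illustrative sanity check of the arithmetic; the numbers are made up and the attribute names follow the code shown above:

# Illustrative values only (not defaults of this PR).
offline_evaluation_duration = 16        # total batches to evaluate per `evaluate_offline()` call
num_offline_eval_runners = 4            # remote OfflineEvaluationRunners
dataset_num_iters_per_eval_runner = 2   # dataset iterations consumed per remote `run()` call

# Each remote call already consumes `dataset_num_iters_per_eval_runner` units, so it
# must stay below the per-runner share of the total duration; otherwise a single
# round of remote calls overshoots the fixed evaluation budget.
assert dataset_num_iters_per_eval_runner < (
    offline_evaluation_duration // num_offline_eval_runners
)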
42 changes: 42 additions & 0 deletions rllib/algorithms/algorithm_config.py
@@ -533,6 +533,8 @@ def __init__(self, algo_class: Optional[type] = None):
# Offline evaluation.
self.offline_evaluation_interval = None
self.num_offline_eval_runners = 0
self.offline_evaluation_type: str = None
self.offline_eval_runner_class = None
# TODO (simon): Only `_offline_evaluate_with_fixed_duration` works. Also,
# decide, if we use `offline_evaluation_duration` or
# `dataset_num_iters_per_offline_eval_runner`. Should the user decide here?
@@ -2705,6 +2707,8 @@ def evaluation(
# Offline evaluation.
offline_evaluation_interval: Optional[int] = NotProvided,
num_offline_eval_runners: Optional[int] = NotProvided,
offline_evaluation_type: Optional[Callable] = NotProvided,
offline_eval_runner_class: Optional[Callable] = NotProvided,
offline_loss_for_module_fn: Optional[Callable] = NotProvided,
offline_eval_batch_size_per_runner: Optional[int] = NotProvided,
dataset_num_iters_per_offline_eval_runner: Optional[int] = NotProvided,
@@ -2829,6 +2833,13 @@ def evaluation(
for parallel evaluation. Setting this to 0 forces sampling to be done in the
local OfflineEvaluationRunner (main process or the Algorithm's actor when
using Tune).
offline_evaluation_type: Type of offline evaluation to run. Either `"eval_loss"`
for evaluating the validation loss of the policy, `"is"` for importance
sampling, or `"pdis"` for per-decision importance sampling. If you want to
implement your own offline evaluation method, write an `OfflineEvaluationRunner`
and use `AlgorithmConfig.offline_eval_runner_class`.
offline_eval_runner_class: An `OfflineEvaluationRunner` class that implements
custom offline evaluation logic.

Contributor: Question: So, if a user provides offline_eval_runner_class, then the value of this field is ignored?
For more explicitness, should we not provide these three built-ins ("eval_loss", "is", "pdis") as classes as well and show users where to find them in the repo? Then this config setting would be superfluous. Or do you think it's too complicated to explain?

Collaborator (Author): This is a good one. Let me think about this. Both solutions have their advantages.

offline_loss_for_module_fn: A callable to compute the loss per `RLModule` in
offline evaluation. If not provided the training loss function (
`Learner.compute_loss_for_module`) is used. The signature must be (
@@ -2975,6 +2986,10 @@ def evaluation(
self.offline_evaluation_interval = offline_evaluation_interval
if num_offline_eval_runners is not NotProvided:
self.num_offline_eval_runners = num_offline_eval_runners
if offline_evaluation_type is not NotProvided:
self.offline_evaluation_type = offline_evaluation_type
if offline_eval_runner_class is not NotProvided:
self.offline_eval_runner_class = offline_eval_runner_class
if offline_loss_for_module_fn is not NotProvided:
self.offline_loss_for_module_fn = offline_loss_for_module_fn
if offline_eval_batch_size_per_runner is not NotProvided:
@@ -5282,6 +5297,33 @@ def _validate_offline_settings(self):
"recorded episodes cannot be read in for training."
)

# Offline evaluation.
from ray.rllib.offline.offline_policy_evaluation_runner import (
OfflinePolicyEvaluationTypes,
)

offline_eval_types = list(OfflinePolicyEvaluationTypes)
if (
self.offline_evaluation_type
and self.offline_evaluation_type != "eval_loss"
and self.offline_evaluation_type not in OfflinePolicyEvaluationTypes
):
self._value_error(
f"Unknown offline evaluation type: {self.offline_evaluation_type}. "
"Available types of offline evaluation are either 'eval_loss' to evaluate "
f"the training loss on a validation dataset or {offline_eval_types}."
)

from ray.rllib.offline.offline_evaluation_runner import OfflineEvaluationRunner

if self.offline_eval_runner_class and not issubclass(
self.offline_eval_runner_class, OfflineEvaluationRunner
):
self._value_error(
"Unknown `offline_eval_runner_class`. A custom offline evaluation runner "
"class needs to inherit from the `OfflineEvaluationRunner` class."
)

@property
def is_online(self) -> bool:
"""Defines if this config is for online RL.
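For reference, a minimal sketch of how the new evaluation settings introduced above could be wired together. The algorithm (BC), environment, and dataset path are placeholders chosen for illustration and are not taken from this PR:

from ray.rllib.algorithms.bc import BCConfig

config = (
    BCConfig()
    .environment("CartPole-v1")
    # Placeholder path to previously recorded offline episodes.
    .offline_data(input_="/tmp/cartpole-offline-data")
    .evaluation(
        offline_evaluation_interval=1,                # run offline eval every training iteration
        num_offline_eval_runners=2,                   # two remote OfflineEvaluationRunners
        offline_evaluation_type="pdis",               # "eval_loss", "is", or "pdis"
        offline_eval_batch_size_per_runner=256,
        dataset_num_iters_per_offline_eval_runner=1,
    )
)
algo = config.build()
results = algo.train()  # offline evaluation results are reported alongside training results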
4 changes: 2 additions & 2 deletions rllib/env/single_agent_env_runner.py
@@ -867,11 +867,11 @@ def _log_episode_metrics(self, length, ret, sec):
self.metrics.log_value(EPISODE_DURATION_SEC_MEAN, sec, window=win)
# Per-agent returns.
self.metrics.log_value(
("agent_episode_returns_mean", DEFAULT_AGENT_ID), ret, window=win
("agent_episode_return_mean", DEFAULT_AGENT_ID), ret, window=win
)
# Per-RLModule returns.
self.metrics.log_value(
("module_episode_returns_mean", DEFAULT_MODULE_ID), ret, window=win
("module_episode_return_mean", DEFAULT_MODULE_ID), ret, window=win
)

# For some metrics, log min/max as well.
17 changes: 3 additions & 14 deletions rllib/offline/offline_evaluation_runner.py
@@ -7,8 +7,6 @@
from ray.data.iterator import DataIterator
from ray.rllib.core import (
ALL_MODULES,
COMPONENT_ENV_TO_MODULE_CONNECTOR,
COMPONENT_MODULE_TO_ENV_CONNECTOR,
COMPONENT_RL_MODULE,
)
from ray.rllib.core.rl_module.apis import SelfSupervisedLossAPI
@@ -64,6 +62,7 @@ def __init__(
# This has to be defined after we have a `self.config`.
self._loss_for_module_fn = types.MethodType(self.get_loss_for_module_fn(), self)

@override(Runner)
def run(
self,
explore: bool = False,
@@ -224,21 +223,14 @@ def get_state(
**kwargs,
)
state[WEIGHTS_SEQ_NO] = self._weights_seq_no
if self._check_component(
COMPONENT_ENV_TO_MODULE_CONNECTOR, components, not_components
):
state[COMPONENT_ENV_TO_MODULE_CONNECTOR] = self._env_to_module.get_state()
if self._check_component(
COMPONENT_MODULE_TO_ENV_CONNECTOR, components, not_components
):
state[COMPONENT_MODULE_TO_ENV_CONNECTOR] = self._module_to_env.get_state()

return state

def _convert_to_tensor(self, struct) -> TensorType:
"""Converts structs to a framework-specific tensor."""
return convert_to_torch_tensor(struct)

@override(Runner)
def stop(self) -> None:
"""Releases all resources used by this EnvRunner.

@@ -247,6 +239,7 @@ def stop(self) -> None:
"""
pass

@override(Runner)
def __del__(self) -> None:
"""If this Actor is deleted, clears all resources used by it."""
pass
@@ -333,10 +326,6 @@ def compute_eval_loss_for_module(

@override(Checkpointable)
def set_state(self, state: StateDict) -> None:
if COMPONENT_ENV_TO_MODULE_CONNECTOR in state:
self._env_to_module.set_state(state[COMPONENT_ENV_TO_MODULE_CONNECTOR])
if COMPONENT_MODULE_TO_ENV_CONNECTOR in state:
self._module_to_env.set_state(state[COMPONENT_MODULE_TO_ENV_CONNECTOR])

# Update the RLModule state.
if COMPONENT_RL_MODULE in state:
25 changes: 24 additions & 1 deletion rllib/offline/offline_evaluation_runner_group.py
@@ -7,6 +7,11 @@
from ray.rllib.env import INPUT_ENV_SPACES
from ray.rllib.offline.offline_data import OfflineData
from ray.rllib.offline.offline_evaluation_runner import OfflineEvaluationRunner
from ray.rllib.offline.offline_policy_evaluation_runner import (
OfflinePolicyEvaluationRunner,
OfflinePolicyPreEvaluator,
)
from ray.rllib.offline.offline_prelearner import OfflinePreLearner
from ray.rllib.utils.annotations import override
from ray.rllib.utils.runners.runner_group import RunnerGroup

@@ -57,6 +62,22 @@ def _setup(
**kwargs: Dict[str, Any],
) -> None:

# Define the offline evaluation runner class.
self._runner_cls = config.offline_eval_runner_class or (
OfflineEvaluationRunner
if config.offline_evaluation_type == "eval_loss"
else OfflinePolicyEvaluationRunner
)
# Define the prelearner (or pre-evaluator) class to use.
self._pre_learner_or_evaluator_cls = self.config.prelearner_class or (
OfflinePreLearner
if config.offline_evaluation_type == "eval_loss"
else OfflinePolicyPreEvaluator
)
self.config._is_frozen = False
self.config.prelearner_class = self._pre_learner_or_evaluator_cls
self.config._is_frozen = True

# We can either run on a local runner or on remote runners only b/c
# streaming split needs remote runners.
if num_runners > 0 and local_runner:
@@ -73,6 +94,8 @@
# Do not validate until the `DataIterators` are distributed.
validate=False,
module_spec=module_spec,
module_state=module_state,
spaces=spaces,
)

# Setup the evaluation offline dataset and return an iterator.
@@ -124,7 +147,7 @@ def runner_health_probe_timeout_s(self):
@property
def runner_cls(self) -> Callable:
"""Class for each runner."""
return OfflineEvaluationRunner
return self._runner_cls

@property
def num_runners(self) -> int:
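Related to the review question above about exposing the built-in evaluation types as classes: the class-based route already works through `offline_eval_runner_class`, as selected in `_setup()` above. A rough sketch follows; the subclass and its body are hypothetical and not part of this PR, and `config` continues the configuration object from the earlier sketch:

from ray.rllib.offline.offline_evaluation_runner import OfflineEvaluationRunner


class MyOfflineEvalRunner(OfflineEvaluationRunner):
    """Hypothetical custom runner that reuses the default evaluation loop."""

    def run(self, explore: bool = False, **kwargs):
        # Custom offline-evaluation logic would go here; this sketch simply
        # defers to the parent implementation.
        return super().run(explore=explore, **kwargs)


# The runner group then instantiates `MyOfflineEvalRunner` instead of one of the
# built-in runner classes.
config.evaluation(offline_eval_runner_class=MyOfflineEvalRunner)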