trl/examples/scripts/openenv/echo.py at main · huggingface/trl

133 lines (106 loc) · 3.78 KB
# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# /// script
# dependencies = [
#     "trl",
#     "openenv-echo-env @ git+https://huggingface.co/spaces/qgallouedec/echo_env",
# ]
# ///

"""
Simple script to run GRPO training with OpenEnv's Echo environment. The environment echoes back the message
sent to it and rewards longer completions.

Setup (Option A - Install from HF Space, recommended):

    uv pip install git+https://huggingface.co/spaces/qgallouedec/echo_env

Setup (Option B - Clone OpenEnv repo, for development):

    git clone https://github.com/huggingface/OpenEnv.git
    cd OpenEnv/envs/echo_env
    uv pip install -e .

Usage:

    python examples/scripts/openenv/echo.py
    python examples/scripts/openenv/echo.py --model Qwen/Qwen2.5-0.5B-Instruct --env-host https://qgallouedec-echo-env.hf.space
"""

import argparse
from datasets import Dataset
from echo_env import EchoEnv
from echo_env.models import EchoAction
from trl import GRPOConfig, GRPOTrainer
def parse_args(argv=None):
    """Parse the command-line options of the Echo GRPO training script.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default), argparse falls back to the process command line.

    Returns:
        An ``argparse.Namespace`` with two string attributes: ``model`` and
        ``env_host``.
    """
    parser = argparse.ArgumentParser(description="Run GRPO training with Echo environment.")
    parser.add_argument(
        "--model",
        type=str,
        default="Qwen/Qwen3-0.6B",
        help="Model to use for training.",
    )
    parser.add_argument(
        "--env-host",
        type=str,
        default="https://qgallouedec-echo-env.hf.space",
        help="URL for the Echo environment HF Space.",
    )
    return parser.parse_args(argv)
def reward_func(environments, **kwargs):
    """Collect the reward currently stored on each environment.

    Args:
        environments: Iterable of objects exposing a ``reward`` attribute.
        **kwargs: Accepted for signature compatibility and ignored.

    Returns:
        A list holding one reward per environment, in input order.
    """
    rewards = []
    for environment in environments:
        rewards.append(environment.reward)
    return rewards
def main():
    """Build the prompt dataset and Echo tool environment, then run GRPO training.

    Reads the model name and environment URL from the command line, builds a
    small in-memory dataset of chat prompts, and trains with ``GRPOTrainer``
    using ``reward_func`` as the reward function.
    """
    args = parse_args()

    # Five conversational prompts, each a single user turn asking for a phrase
    # to be echoed through the environment.
    dataset = Dataset.from_dict(
        {
            "prompt": [
                [{"role": "user", "content": "Try to echo 'Hello World!' in the environment."}],
                [{"role": "user", "content": "Make the environment echo 'Goodbye World!'"}],
                [{"role": "user", "content": "Can you ask the environment to echo 'TRL is great!'?"}],
                [{"role": "user", "content": "What happens if you ask the environment to echo 'I love RLHF!'?"}],
                [{"role": "user", "content": "Try to make the environment echo 'OpenEnv is awesome!'"}],
            ],
        }
    )

    # Defined inside main() so it can close over `args.env_host`.
    # NOTE(review): presumably the trainer instantiates this factory itself and
    # exposes `echo` to the model as a callable tool — confirm against TRL docs.
    class EchoToolEnv:
        def __init__(self):
            # Client for the remote Echo environment, plus the reward of the most
            # recent echo call (0.0 until one has been made).
            self.env = EchoEnv(base_url=args.env_host)
            self.reward = 0.0

        def reset(self, **kwargs) -> None | str:
            # Only the locally tracked reward is cleared; the remote environment
            # is not contacted here.
            self.reward = 0.0
            return None

        def echo(self, message: str) -> str:
            """
            Echo the message back from the environment.

            Args:
                message: The message to echo

            Returns:
                The echoed message.
            """
            observation = self.env.step(EchoAction(message=message))
            # Remember the reward so reward_func can read it from this instance.
            self.reward = observation.observation.reward
            return observation.observation.echoed_message

    trainer = GRPOTrainer(
        model=args.model,
        train_dataset=dataset,
        reward_funcs=reward_func,
        args=GRPOConfig(
            chat_template_kwargs={"enable_thinking": False},
            log_completions=True,
            logging_steps=2,
            num_completions_to_print=1,
        ),
        environment_factory=EchoToolEnv,
    )
    trainer.train()
# Run training only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

echo.py

Latest commit

History

echo.py

File metadata and controls