Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
4ebca67
ongoing ddpg
joneswong Mar 14, 2018
15c2f01
ongoing ddpg converged
joneswong Mar 16, 2018
795c82c
gpu machine changes
joneswong Mar 16, 2018
841891d
tuned
joneswong Mar 19, 2018
8add6c3
sync with the latest ray-project
joneswong Mar 28, 2018
9a29df5
tuned ddpg specification
joneswong Mar 28, 2018
2981a02
merge the latest ray
joneswong Mar 29, 2018
95a3e86
ddpg
joneswong Apr 5, 2018
39a4537
supplement missed optimizer argument clip_rewards in default DQN con…
joneswong Apr 8, 2018
125b0e3
ddpg supports vision env (atari) now
joneswong Apr 8, 2018
24dfa67
Merge branch 'master' into dev_jones
joneswong Apr 9, 2018
040ebc7
Merge branch 'master' into dev_jones_contrib
joneswong Apr 9, 2018
559b6ee
revised according to code review comments
joneswong Apr 11, 2018
4b97522
merge this contribution
joneswong Apr 11, 2018
16bfaef
added regression test case
joneswong Apr 11, 2018
bcebaac
removed irrelevant files
joneswong Apr 11, 2018
2bceb38
validate ddpg on mountain_car_continuous
joneswong Apr 11, 2018
53e59b9
Merge branch 'dev_jones'
joneswong Apr 11, 2018
763f495
restore unnecessary slight changes
joneswong Apr 11, 2018
3550143
revised according to eric's comments
joneswong Apr 12, 2018
f595276
added the requested tests
joneswong Apr 12, 2018
4ce455d
revised accordingly
joneswong Apr 13, 2018
c525014
revised accordingly and re-validated
joneswong Apr 13, 2018
8336763
Merge branch 'master' into master
ericl Apr 16, 2018
abe1f69
formatted by yapf
joneswong Apr 18, 2018
137dd98
Merge branch 'master' of https://github.com/AlibabaPAI/ray
joneswong Apr 18, 2018
950e756
fix lint errors
joneswong Apr 19, 2018
da37284
formatted by yapf
joneswong Apr 19, 2018
5055453
fix lint errors
joneswong Apr 19, 2018
859adf4
formatted by yapf
joneswong Apr 19, 2018
3c11046
fix lint errors
joneswong Apr 19, 2018
fc8932b
fix lint error
joneswong Apr 19, 2018
0aaba16
Merge branch 'dev_jones'
joneswong Apr 19, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion python/ray/rllib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@

def _register_all():
    """Register every built-in RLlib algorithm with Tune by name.

    Kept inside a function so the (potentially heavy) agent imports only
    happen when registration is requested, not at module import time.
    """
    # The import is loop-invariant, so hoist it out of the loop; keeping it
    # inside the function still defers the import until first call.
    from ray.rllib.agent import get_agent_class
    for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG",
                "DDPG2", "APEX_DDPG2", "__fake", "__sigmoid_fake_data",
                "__parameter_tuning"]:
        register_trainable(key, get_agent_class(key))

Expand Down
8 changes: 7 additions & 1 deletion python/ray/rllib/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,13 @@ def _train(self):
def get_agent_class(alg):
"""Returns the class of a known agent given its name."""

if alg == "PPO":
if alg == "DDPG2":
from ray.rllib import ddpg2
return ddpg2.DDPG2Agent
elif alg == "APEX_DDPG2":
from ray.rllib import ddpg2
return ddpg2.ApexDDPG2Agent
elif alg == "PPO":
from ray.rllib import ppo
return ppo.PPOAgent
elif alg == "ES":
Expand Down
1 change: 1 addition & 0 deletions python/ray/rllib/ddpg2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Code in this package follows the style of dqn.
8 changes: 8 additions & 0 deletions python/ray/rllib/ddpg2/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from ray.rllib.ddpg2.apex import ApexDDPG2Agent
from ray.rllib.ddpg2.ddpg import DDPG2Agent, DEFAULT_CONFIG

__all__ = ["DDPG2Agent", "ApexDDPG2Agent", "DEFAULT_CONFIG"]
47 changes: 47 additions & 0 deletions python/ray/rllib/ddpg2/apex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from ray.rllib.ddpg2.ddpg import DDPG2Agent, DEFAULT_CONFIG as DDPG_CONFIG

# Ape-X DDPG defaults: start from the base DDPG config and override the
# settings needed for the distributed, prioritized-replay Ape-X optimizer.
# Sized by default for a large single node (32 workers).
APEX_DDPG_DEFAULT_CONFIG = dict(DDPG_CONFIG)
APEX_DDPG_DEFAULT_CONFIG.update(
    optimizer_class="ApexOptimizer",
    optimizer_config=dict(
        DDPG_CONFIG["optimizer_config"],
        max_weight_sync_delay=400,
        num_replay_buffer_shards=4,
        debug=False),
    n_step=3,
    num_workers=32,
    buffer_size=2000000,
    learning_starts=50000,
    train_batch_size=512,
    sample_batch_size=50,
    max_weight_sync_delay=400,
    target_network_update_freq=500000,
    timesteps_per_iteration=25000,
    per_worker_exploration=True,
    worker_side_prioritization=True)


class ApexDDPG2Agent(DDPG2Agent):
    """DDPG variant that uses the Ape-X distributed policy optimizer.

    By default, this is configured for a large single node (32 cores). For
    running in a large cluster, increase the `num_workers` config var.
    """

    _agent_name = "APEX_DDPG"
    _default_config = APEX_DDPG_DEFAULT_CONFIG

    def update_target_if_needed(self):
        # Unlike the base class, Ape-X measures target-update staleness in
        # steps *trained*, not steps sampled.
        trained_steps = self.optimizer.num_steps_trained
        staleness = trained_steps - self.last_target_update_ts
        if staleness > self.config["target_network_update_freq"]:
            self.local_evaluator.update_target()
            self.last_target_update_ts = trained_steps
            self.num_target_updates += 1
Empty file.
268 changes: 268 additions & 0 deletions python/ray/rllib/ddpg2/ddpg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pickle
import os

import numpy as np
import tensorflow as tf

import ray
from ray.rllib import optimizers
from ray.rllib.ddpg2.ddpg_evaluator import DDPGEvaluator
from ray.rllib.agent import Agent
from ray.tune.result import TrainingResult

# Agent-config keys mirrored into the optimizer config (unless the user
# already set them there explicitly); consumed in DDPG2Agent._init.
OPTIMIZER_SHARED_CONFIGS = [
    "buffer_size",
    "prioritized_replay",
    "prioritized_replay_alpha",
    "prioritized_replay_beta",
    "prioritized_replay_eps",
    "sample_batch_size",
    "train_batch_size",
    "learning_starts",
    "clip_rewards",
]

DEFAULT_CONFIG = {
    # === Model ===
    # Hidden layer sizes of the actor network
    "actor_hiddens": [64, 64],
    # Hidden layer sizes of the critic network
    "critic_hiddens": [64, 64],
    # N-step Q learning
    "n_step": 1,
    # Config options to pass to the model constructor
    "model": {},
    # Discount factor for the MDP
    "gamma": 0.99,
    # Arguments to pass to the env creator
    "env_config": {},

    # === Exploration ===
    # Max num timesteps for annealing schedules. Exploration is annealed from
    # 1.0 to exploration_fraction over this number of timesteps scaled by
    # exploration_fraction
    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
    "exploration_fraction": 0.1,
    # Final value of random action probability
    "exploration_final_eps": 0.02,
    # OU-noise scale
    "noise_scale": 0.1,
    # OU-noise mean-reversion rate (theta)
    "exploration_theta": 0.15,
    # OU-noise volatility (sigma)
    "exploration_sigma": 0.2,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 0,
    # Update the target by \tau * policy + (1-\tau) * target_policy
    "tau": 0.002,
    # Whether to start with random actions instead of noops.
    "random_starts": True,

    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    "buffer_size": 50000,
    # If True prioritized replay buffer will be used.
    "prioritized_replay": True,
    # Alpha parameter for prioritized replay buffer.
    "prioritized_replay_alpha": 0.6,
    # Beta parameter for sampling from prioritized replay buffer.
    "prioritized_replay_beta": 0.4,
    # Epsilon to add to the TD errors when updating priorities.
    "prioritized_replay_eps": 1e-6,
    # Whether to clip rewards to [-1, 1] prior to adding to the replay buffer.
    "clip_rewards": True,

    # === Optimization ===
    # Learning rates for the actor and critic adam optimizers
    "actor_lr": 1e-4,
    "critic_lr": 1e-3,
    # If True, use huber loss instead of squared loss for critic network
    # Conventionally, no need to clip gradients if using a huber loss
    "use_huber": False,
    # Threshold of a huber loss
    "huber_threshold": 1.0,
    # Weights for L2 regularization
    "l2_reg": 1e-6,
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": None,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1500,
    # Update the replay buffer with this many samples at once. Note that this
    # setting applies per-worker if num_workers > 1.
    "sample_batch_size": 1,
    # Size of a batch sampled from replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 256,
    # Smooth the current average reward over this many previous episodes.
    "smoothing_num_episodes": 100,

    # === Tensorflow ===
    # Arguments to pass to tensorflow
    "tf_session_args": {
        "device_count": {
            "CPU": 2
        },
        "log_device_placement": False,
        "allow_soft_placement": True,
        "gpu_options": {
            "allow_growth": True
        },
        "inter_op_parallelism_threads": 1,
        "intra_op_parallelism_threads": 1,
    },

    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Whether to allocate GPUs for workers (if > 0).
    "num_gpus_per_worker": 0,
    # Optimizer class to use.
    "optimizer_class": "LocalSyncReplayOptimizer",
    # Config to pass to the optimizer.
    "optimizer_config": {},
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
}


class DDPG2Agent(Agent):
    """DDPG agent; this package follows the structure of the DQN agent.

    A local DDPGEvaluator (and optionally remote ones) collects samples,
    a replay-based policy optimizer trains on them, and the target
    networks are refreshed every `target_network_update_freq` sampled
    timesteps.
    """

    _agent_name = "DDPG2"
    _allow_unknown_subkeys = [
        "model", "optimizer", "tf_session_args", "env_config"
    ]
    _default_config = DEFAULT_CONFIG

    def _init(self):
        """Create the evaluators, the policy optimizer, and the TF saver."""
        self.local_evaluator = DDPGEvaluator(self.registry, self.env_creator,
                                             self.config, self.logdir, 0)
        remote_cls = ray.remote(
            num_cpus=1,
            num_gpus=self.config["num_gpus_per_worker"])(DDPGEvaluator)
        self.remote_evaluators = [
            remote_cls.remote(self.registry, self.env_creator, self.config,
                              self.logdir, i)
            for i in range(self.config["num_workers"])
        ]

        # Mirror shared agent settings into the optimizer config, without
        # clobbering values the user set there explicitly.
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer_config"]:
                self.config["optimizer_config"][k] = self.config[k]

        self.optimizer = getattr(optimizers, self.config["optimizer_class"])(
            self.config["optimizer_config"], self.local_evaluator,
            self.remote_evaluators)

        self.saver = tf.train.Saver(max_to_keep=None)
        self.last_target_update_ts = 0
        self.num_target_updates = 0

    @property
    def global_timestep(self):
        """Total number of env steps sampled by the optimizer so far."""
        return self.optimizer.num_steps_sampled

    def update_target_if_needed(self):
        """Sync target networks when enough steps have been sampled."""
        if self.global_timestep - self.last_target_update_ts > \
                self.config["target_network_update_freq"]:
            self.local_evaluator.update_target()
            self.last_target_update_ts = self.global_timestep
            self.num_target_updates += 1

    def _train(self):
        """Run optimizer steps until `timesteps_per_iteration` new env
        steps have been sampled, then return aggregated stats."""
        start_timestep = self.global_timestep

        while (self.global_timestep - start_timestep <
               self.config["timesteps_per_iteration"]):

            self.optimizer.step()
            self.update_target_if_needed()

            # Keep evaluators' exploration schedules in sync with the
            # global sample count. NOTE(review): the scrape lost the
            # original indentation; per-step syncing matches the DQN
            # agent this package mirrors — confirm against upstream.
            self.local_evaluator.set_global_timestep(self.global_timestep)
            for e in self.remote_evaluators:
                e.set_global_timestep.remote(self.global_timestep)

        return self._train_stats(start_timestep)

    def _train_stats(self, start_timestep):
        """Aggregate evaluator and optimizer stats into a TrainingResult."""
        if self.remote_evaluators:
            stats = ray.get([e.stats.remote() for e in self.remote_evaluators])
        else:
            stats = self.local_evaluator.stats()
        if not isinstance(stats, list):
            stats = [stats]

        mean_100ep_reward = 0.0
        mean_100ep_length = 0.0
        num_episodes = 0
        explorations = []

        if self.config["per_worker_exploration"]:
            # Return stats from workers with the lowest 20% of exploration
            test_stats = stats[-int(max(1, len(stats) * 0.2)):]
        else:
            test_stats = stats

        for s in test_stats:
            mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats)
            mean_100ep_length += s["mean_100ep_length"] / len(test_stats)

        for s in stats:
            num_episodes += s["num_episodes"]
            explorations.append(s["exploration"])

        opt_stats = self.optimizer.stats()

        result = TrainingResult(
            episode_reward_mean=mean_100ep_reward,
            episode_len_mean=mean_100ep_length,
            episodes_total=num_episodes,
            timesteps_this_iter=self.global_timestep - start_timestep,
            info=dict({
                "min_exploration": min(explorations),
                "max_exploration": max(explorations),
                "num_target_updates": self.num_target_updates,
            }, **opt_stats))

        return result

    def _stop(self):
        # workaround for https://github.com/ray-project/ray/issues/1516
        for ev in self.remote_evaluators:
            ev.__ray_terminate__.remote(ev._ray_actor_id.id())

    def _save(self, checkpoint_dir):
        """Save TF weights plus evaluator/optimizer extra state.

        Returns the checkpoint path; extra state is pickled alongside it.
        """
        checkpoint_path = self.saver.save(
            self.local_evaluator.sess,
            os.path.join(checkpoint_dir, "checkpoint"),
            global_step=self.iteration)
        extra_data = [
            self.local_evaluator.save(),
            ray.get([e.save.remote() for e in self.remote_evaluators]),
            self.optimizer.save(), self.num_target_updates,
            self.last_target_update_ts
        ]
        # Fix: the original passed a bare open() to pickle.dump and never
        # closed the handle; use a context manager to close deterministically.
        with open(checkpoint_path + ".extra_data", "wb") as f:
            pickle.dump(extra_data, f)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        """Restore TF weights and the extra state written by _save()."""
        self.saver.restore(self.local_evaluator.sess, checkpoint_path)
        # Fix: same unclosed-file-handle issue as in _save().
        with open(checkpoint_path + ".extra_data", "rb") as f:
            extra_data = pickle.load(f)
        self.local_evaluator.restore(extra_data[0])
        ray.get([
            e.restore.remote(d)
            for (d, e) in zip(extra_data[1], self.remote_evaluators)
        ])
        self.optimizer.restore(extra_data[2])
        self.num_target_updates = extra_data[3]
        self.last_target_update_ts = extra_data[4]

    def compute_action(self, observation):
        """Compute a single deterministic action (exploration scale 0.0)."""
        return self.local_evaluator.ddpg_graph.act(self.local_evaluator.sess,
                                                   np.array(observation)[None],
                                                   0.0)[0]
Loading