Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
4ebca67
ongoing ddpg
joneswong Mar 14, 2018
15c2f01
ongoing ddpg converged
joneswong Mar 16, 2018
795c82c
gpu machine changes
joneswong Mar 16, 2018
841891d
tuned
joneswong Mar 19, 2018
8add6c3
sync with the latest ray-project
joneswong Mar 28, 2018
9a29df5
tuned ddpg specification
joneswong Mar 28, 2018
2981a02
merge the latest ray
joneswong Mar 29, 2018
95a3e86
ddpg
joneswong Apr 5, 2018
39a4537
supplement missed optimizer argument clip_rewards in default DQN con…
joneswong Apr 8, 2018
125b0e3
ddpg supports vision env (atari) now
joneswong Apr 8, 2018
24dfa67
Merge branch 'master' into dev_jones
joneswong Apr 9, 2018
040ebc7
Merge branch 'master' into dev_jones_contrib
joneswong Apr 9, 2018
559b6ee
revised according to code review comments
joneswong Apr 11, 2018
4b97522
merge this contribution
joneswong Apr 11, 2018
16bfaef
added regression test case
joneswong Apr 11, 2018
bcebaac
removed irrelevant files
joneswong Apr 11, 2018
2bceb38
validate ddpg on mountain_car_continuous
joneswong Apr 11, 2018
53e59b9
Merge branch 'dev_jones'
joneswong Apr 11, 2018
763f495
restore unnecessary slight changes
joneswong Apr 11, 2018
3550143
revised according to eric's comments
joneswong Apr 12, 2018
f595276
added the requested tests
joneswong Apr 12, 2018
4ce455d
revised accordingly
joneswong Apr 13, 2018
c525014
revised accordingly and re-validated
joneswong Apr 13, 2018
8336763
Merge branch 'master' into master
ericl Apr 16, 2018
abe1f69
formatted by yapf
joneswong Apr 18, 2018
137dd98
Merge branch 'master' of https://github.com/AlibabaPAI/ray
joneswong Apr 18, 2018
950e756
fix lint errors
joneswong Apr 19, 2018
da37284
formatted by yapf
joneswong Apr 19, 2018
5055453
fix lint errors
joneswong Apr 19, 2018
859adf4
formatted by yapf
joneswong Apr 19, 2018
3c11046
fix lint errors
joneswong Apr 19, 2018
fc8932b
fix lint error
joneswong Apr 19, 2018
0aaba16
Merge branch 'dev_jones'
joneswong Apr 19, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion python/ray/rllib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@

def _register_all():
    """Register every built-in RLlib algorithm with Tune by name.

    Kept inside a function so the (potentially heavy) agent imports only
    happen when registration is requested, not at module import time.
    """
    # The import is loop-invariant, so hoist it out of the loop; keeping it
    # inside the function still defers the import until first call.
    from ray.rllib.agent import get_agent_class
    for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG",
                "DDPG2", "APEX_DDPG2", "__fake", "__sigmoid_fake_data",
                "__parameter_tuning"]:
        register_trainable(key, get_agent_class(key))

Expand Down
8 changes: 7 additions & 1 deletion python/ray/rllib/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,13 @@ def _train(self):
def get_agent_class(alg):
"""Returns the class of a known agent given its name."""

if alg == "PPO":
if alg == "DDPG2":
from ray.rllib import ddpg2
return ddpg2.DDPG2Agent
elif alg == "APEX_DDPG2":
from ray.rllib import ddpg2
return ddpg2.ApexDDPG2Agent
elif alg == "PPO":
from ray.rllib import ppo
return ppo.PPOAgent
elif alg == "ES":
Expand Down
1 change: 1 addition & 0 deletions python/ray/rllib/ddpg2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Code in this package follows the style of dqn.
8 changes: 8 additions & 0 deletions python/ray/rllib/ddpg2/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from ray.rllib.ddpg2.apex import ApexDDPG2Agent
from ray.rllib.ddpg2.ddpg import DDPG2Agent, DEFAULT_CONFIG

__all__ = ["DDPG2Agent", "ApexDDPG2Agent", "DEFAULT_CONFIG"]
47 changes: 47 additions & 0 deletions python/ray/rllib/ddpg2/apex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from ray.rllib.ddpg2.ddpg import DDPG2Agent, DEFAULT_CONFIG as DDPG_CONFIG

# Ape-X DDPG defaults: start from the base DDPG config and override the
# settings needed for the distributed, prioritized-replay Ape-X optimizer.
# Sized by default for a large single node (32 workers).
APEX_DDPG_DEFAULT_CONFIG = dict(DDPG_CONFIG)
APEX_DDPG_DEFAULT_CONFIG.update(
    optimizer_class="ApexOptimizer",
    optimizer_config=dict(
        DDPG_CONFIG["optimizer_config"],
        max_weight_sync_delay=400,
        num_replay_buffer_shards=4,
        debug=False),
    n_step=3,
    num_workers=32,
    buffer_size=2000000,
    learning_starts=50000,
    train_batch_size=512,
    sample_batch_size=50,
    max_weight_sync_delay=400,
    target_network_update_freq=500000,
    timesteps_per_iteration=25000,
    per_worker_exploration=True,
    worker_side_prioritization=True)


class ApexDDPG2Agent(DDPG2Agent):
    """DDPG variant that uses the Ape-X distributed policy optimizer.

    By default, this is configured for a large single node (32 cores). For
    running in a large cluster, increase the `num_workers` config var.
    """

    _agent_name = "APEX_DDPG"
    _default_config = APEX_DDPG_DEFAULT_CONFIG

    def update_target_if_needed(self):
        # Unlike the base class, Ape-X measures target-update staleness in
        # steps *trained*, not steps sampled.
        trained_steps = self.optimizer.num_steps_trained
        staleness = trained_steps - self.last_target_update_ts
        if staleness > self.config["target_network_update_freq"]:
            self.local_evaluator.update_target()
            self.last_target_update_ts = trained_steps
            self.num_target_updates += 1
Empty file.
268 changes: 268 additions & 0 deletions python/ray/rllib/ddpg2/ddpg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pickle
import os

import numpy as np
import tensorflow as tf

import ray
from ray.rllib import optimizers
from ray.rllib.ddpg2.ddpg_evaluator import DDPGEvaluator
from ray.rllib.agent import Agent
from ray.tune.result import TrainingResult

# Agent-config keys mirrored into the optimizer config (unless the user
# already set them there explicitly); consumed in DDPG2Agent._init.
OPTIMIZER_SHARED_CONFIGS = [
    "buffer_size",
    "prioritized_replay",
    "prioritized_replay_alpha",
    "prioritized_replay_beta",
    "prioritized_replay_eps",
    "sample_batch_size",
    "train_batch_size",
    "learning_starts",
    "clip_rewards",
]

DEFAULT_CONFIG = {
    # === Model ===
    # Hidden layer sizes of the actor network
    "actor_hiddens": [64, 64],
    # Hidden layer sizes of the critic network
    "critic_hiddens": [64, 64],
    # N-step Q learning
    "n_step": 1,
    # Config options to pass to the model constructor
    "model": {},
    # Discount factor for the MDP
    "gamma": 0.99,
    # Arguments to pass to the env creator
    "env_config": {},

    # === Exploration ===
    # Max num timesteps for annealing schedules. Exploration is annealed from
    # 1.0 to exploration_fraction over this number of timesteps scaled by
    # exploration_fraction
    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
    "exploration_fraction": 0.1,
    # Final value of random action probability
    "exploration_final_eps": 0.02,
    # OU-noise scale
    "noise_scale": 0.1,
    # OU-noise mean-reversion rate (theta)
    "exploration_theta": 0.15,
    # OU-noise volatility (sigma)
    "exploration_sigma": 0.2,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 0,
    # Update the target by \tau * policy + (1-\tau) * target_policy
    "tau": 0.002,
    # Whether to start with random actions instead of noops.
    "random_starts": True,

    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    "buffer_size": 50000,
    # If True prioritized replay buffer will be used.
    "prioritized_replay": True,
    # Alpha parameter for prioritized replay buffer.
    "prioritized_replay_alpha": 0.6,
    # Beta parameter for sampling from prioritized replay buffer.
    "prioritized_replay_beta": 0.4,
    # Epsilon to add to the TD errors when updating priorities.
    "prioritized_replay_eps": 1e-6,
    # Whether to clip rewards to [-1, 1] prior to adding to the replay buffer.
    "clip_rewards": True,

    # === Optimization ===
    # Learning rates for the actor and critic adam optimizers
    "actor_lr": 1e-4,
    "critic_lr": 1e-3,
    # If True, use huber loss instead of squared loss for critic network
    # Conventionally, no need to clip gradients if using a huber loss
    "use_huber": False,
    # Threshold of a huber loss
    "huber_threshold": 1.0,
    # Weights for L2 regularization
    "l2_reg": 1e-6,
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": None,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1500,
    # Update the replay buffer with this many samples at once. Note that this
    # setting applies per-worker if num_workers > 1.
    "sample_batch_size": 1,
    # Size of a batch sampled from replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 256,
    # Smooth the current average reward over this many previous episodes.
    "smoothing_num_episodes": 100,

    # === Tensorflow ===
    # Arguments to pass to tensorflow
    "tf_session_args": {
        "device_count": {
            "CPU": 2
        },
        "log_device_placement": False,
        "allow_soft_placement": True,
        "gpu_options": {
            "allow_growth": True
        },
        "inter_op_parallelism_threads": 1,
        "intra_op_parallelism_threads": 1,
    },

    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Whether to allocate GPUs for workers (if > 0).
    "num_gpus_per_worker": 0,
    # Optimizer class to use.
    "optimizer_class": "LocalSyncReplayOptimizer",
    # Config to pass to the optimizer.
    "optimizer_config": {},
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
}


class DDPG2Agent(Agent):
    """DDPG agent; this package follows the structure of the DQN agent.

    A local DDPGEvaluator (and optionally remote ones) collects samples,
    a replay-based policy optimizer trains on them, and the target
    networks are refreshed every `target_network_update_freq` sampled
    timesteps.
    """

    _agent_name = "DDPG2"
    _allow_unknown_subkeys = [
        "model", "optimizer", "tf_session_args", "env_config"
    ]
    _default_config = DEFAULT_CONFIG

    def _init(self):
        """Create the evaluators, the policy optimizer, and the TF saver."""
        self.local_evaluator = DDPGEvaluator(self.registry, self.env_creator,
                                             self.config, self.logdir, 0)
        remote_cls = ray.remote(
            num_cpus=1,
            num_gpus=self.config["num_gpus_per_worker"])(DDPGEvaluator)
        self.remote_evaluators = [
            remote_cls.remote(self.registry, self.env_creator, self.config,
                              self.logdir, i)
            for i in range(self.config["num_workers"])
        ]

        # Mirror shared agent settings into the optimizer config, without
        # clobbering values the user set there explicitly.
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer_config"]:
                self.config["optimizer_config"][k] = self.config[k]

        self.optimizer = getattr(optimizers, self.config["optimizer_class"])(
            self.config["optimizer_config"], self.local_evaluator,
            self.remote_evaluators)

        self.saver = tf.train.Saver(max_to_keep=None)
        self.last_target_update_ts = 0
        self.num_target_updates = 0

    @property
    def global_timestep(self):
        """Total number of env steps sampled by the optimizer so far."""
        return self.optimizer.num_steps_sampled

    def update_target_if_needed(self):
        """Sync target networks when enough steps have been sampled."""
        if self.global_timestep - self.last_target_update_ts > \
                self.config["target_network_update_freq"]:
            self.local_evaluator.update_target()
            self.last_target_update_ts = self.global_timestep
            self.num_target_updates += 1

    def _train(self):
        """Run optimizer steps until `timesteps_per_iteration` new env
        steps have been sampled, then return aggregated stats."""
        start_timestep = self.global_timestep

        while (self.global_timestep - start_timestep <
               self.config["timesteps_per_iteration"]):

            self.optimizer.step()
            self.update_target_if_needed()

            # Keep evaluators' exploration schedules in sync with the
            # global sample count. NOTE(review): the scrape lost the
            # original indentation; per-step syncing matches the DQN
            # agent this package mirrors — confirm against upstream.
            self.local_evaluator.set_global_timestep(self.global_timestep)
            for e in self.remote_evaluators:
                e.set_global_timestep.remote(self.global_timestep)

        return self._train_stats(start_timestep)

    def _train_stats(self, start_timestep):
        """Aggregate evaluator and optimizer stats into a TrainingResult."""
        if self.remote_evaluators:
            stats = ray.get([e.stats.remote() for e in self.remote_evaluators])
        else:
            stats = self.local_evaluator.stats()
        if not isinstance(stats, list):
            stats = [stats]

        mean_100ep_reward = 0.0
        mean_100ep_length = 0.0
        num_episodes = 0
        explorations = []

        if self.config["per_worker_exploration"]:
            # Return stats from workers with the lowest 20% of exploration
            test_stats = stats[-int(max(1, len(stats) * 0.2)):]
        else:
            test_stats = stats

        for s in test_stats:
            mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats)
            mean_100ep_length += s["mean_100ep_length"] / len(test_stats)

        for s in stats:
            num_episodes += s["num_episodes"]
            explorations.append(s["exploration"])

        opt_stats = self.optimizer.stats()

        result = TrainingResult(
            episode_reward_mean=mean_100ep_reward,
            episode_len_mean=mean_100ep_length,
            episodes_total=num_episodes,
            timesteps_this_iter=self.global_timestep - start_timestep,
            info=dict({
                "min_exploration": min(explorations),
                "max_exploration": max(explorations),
                "num_target_updates": self.num_target_updates,
            }, **opt_stats))

        return result

    def _stop(self):
        # workaround for https://github.com/ray-project/ray/issues/1516
        for ev in self.remote_evaluators:
            ev.__ray_terminate__.remote(ev._ray_actor_id.id())

    def _save(self, checkpoint_dir):
        """Save TF weights plus evaluator/optimizer extra state.

        Returns the checkpoint path; extra state is pickled alongside it.
        """
        checkpoint_path = self.saver.save(
            self.local_evaluator.sess,
            os.path.join(checkpoint_dir, "checkpoint"),
            global_step=self.iteration)
        extra_data = [
            self.local_evaluator.save(),
            ray.get([e.save.remote() for e in self.remote_evaluators]),
            self.optimizer.save(), self.num_target_updates,
            self.last_target_update_ts
        ]
        # Fix: the original passed a bare open() to pickle.dump and never
        # closed the handle; use a context manager to close deterministically.
        with open(checkpoint_path + ".extra_data", "wb") as f:
            pickle.dump(extra_data, f)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        """Restore TF weights and the extra state written by _save()."""
        self.saver.restore(self.local_evaluator.sess, checkpoint_path)
        # Fix: same unclosed-file-handle issue as in _save().
        with open(checkpoint_path + ".extra_data", "rb") as f:
            extra_data = pickle.load(f)
        self.local_evaluator.restore(extra_data[0])
        ray.get([
            e.restore.remote(d)
            for (d, e) in zip(extra_data[1], self.remote_evaluators)
        ])
        self.optimizer.restore(extra_data[2])
        self.num_target_updates = extra_data[3]
        self.last_target_update_ts = extra_data[4]

    def compute_action(self, observation):
        """Compute a single deterministic action (exploration scale 0.0)."""
        return self.local_evaluator.ddpg_graph.act(self.local_evaluator.sess,
                                                   np.array(observation)[None],
                                                   0.0)[0]
Loading