-
Notifications
You must be signed in to change notification settings - Fork 7.4k
Closed
Labels
P1 (Issue that should be fixed within a few weeks), bug (Something that is supposed to be working; but isn't), tune (Tune-related issues)
Description
Hi,
What is the problem?
I compiled Ray from source at commit 552ebdb in order to use the new `with_parameters` function (#10679) to pass constant variables to my trainable class. However, I get the following error when calling `with_parameters`:
Failure # 1 (occurred at 2020-09-26_16-50-01)
Traceback (most recent call last):
File "/home/karol/PycharmProjects/ray/python/ray/tune/trial_runner.py", line 518, in _process_trial
result = self.trial_executor.fetch_result(trial)
File "/home/karol/PycharmProjects/ray/python/ray/tune/ray_trial_executor.py", line 488, in fetch_result
result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
File "/home/karol/PycharmProjects/ray/python/ray/worker.py", line 1438, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TuneError): ray::ImplicitFunc.train() (pid=25150, ip=141.12.239.114)
File "python/ray/_raylet.pyx", line 484, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 438, in ray._raylet.execute_task.function_executor
File "/home/karol/PycharmProjects/ray/python/ray/tune/trainable.py", line 336, in train
result = self.step()
File "/home/karol/PycharmProjects/ray/python/ray/tune/function_runner.py", line 346, in step
self._report_thread_runner_error(block=True)
File "/home/karol/PycharmProjects/ray/python/ray/tune/function_runner.py", line 464, in _report_thread_runner_error
raise TuneError(("Trial raised an exception. Traceback:\n{}"
ray.tune.error.TuneError: Trial raised an exception. Traceback:
ray::ImplicitFunc.train() (pid=25150, ip=141.12.239.114)
File "/home/karol/PycharmProjects/ray/python/ray/tune/function_runner.py", line 233, in run
self._entrypoint()
File "/home/karol/PycharmProjects/ray/python/ray/tune/function_runner.py", line 295, in entrypoint
return self._trainable_func(self.config, self._status_reporter,
File "/home/karol/PycharmProjects/ray/python/ray/tune/function_runner.py", line 527, in _trainable_func
output = fn()
File "/home/karol/PycharmProjects/ray/python/ray/tune/function_runner.py", line 595, in _inner
inner(config, checkpoint_dir=None)
File "/home/karol/PycharmProjects/ray/python/ray/tune/function_runner.py", line 589, in inner
fn(config, **fn_kwargs)
TypeError: __init__() got multiple values for argument 'arch'
I am using Ubuntu 18.04.4 LTS (GNU/Linux 5.4.0-42-generic x86_64) with Python 3.8.3 and PyTorch 1.5.0.
Reproduction
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining
from ray.tune.function_runner import with_parameters
# Reproduction script: starts Ray, builds a PBT scheduler and launches the
# Tune run that produces the TypeError shown in the traceback.
# NOTE(review): only `tune` is imported from ray in this snippet, yet `ray.init`
# is called here — presumably `import ray` was dropped from the paste.
ray.init(num_cpus=3, num_gpus=3, _memory=7516192768, object_store_memory=7516192768)
# Population Based Training scheduler keyed on "training_iteration", set to
# maximise "val_dice" and to mutate the "lr" hyperparameter.
pbt = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="val_dice",
    mode="max",
    perturbation_interval=2,
    hyperparam_mutations={
        # NOTE(review): bounds are given as (0.001, 0.0001), i.e. larger value
        # first — confirm intended. `np` is also not imported in this snippet.
        "lr": lambda: np.random.uniform(0.001, 0.0001)
    })
# `with_parameters` is used to bind the constant arguments to the class-based
# trainable; per the report, this is the call that fails.
# NOTE(review): MyTrainable, encoder, train_loader, val_loader and device are
# not defined before this point in the paste — presumably defined earlier in
# the real script.
analysis = tune.run(with_parameters(MyTrainable,
                                    arch="FPN",
                                    encoder=encoder,
                                    train_loader=train_loader,
                                    val_loader=val_loader,
                                    optimizer="adam",
                                    device=device),
                    scheduler=pbt,
                    reuse_actors=True,
                    keep_checkpoints_num=3,
                    verbose=1,
                    # NOTE(review): same reversed (0.001, 0.0001) bounds as in
                    # the PBT mutation above — confirm intended.
                    config={"lr": tune.uniform(0.001, 0.0001)},
                    num_samples=2,
                    resources_per_trial={"gpu": 1, "cpu": 1})
# Class-based trainable from the report; its constructor takes the constant
# objects (architecture name, encoder, data loaders, optimizer name, device)
# directly as arguments.
class MyTrainable(tune.Trainable):
    # NOTE(review): the traceback shows with_parameters invoking
    # `fn(config, **fn_kwargs)`. With this signature the positional `config`
    # would bind to `arch` while `arch` is also supplied as a keyword, which is
    # consistent with the reported "got multiple values for argument 'arch'".
    def __init__(self, arch, encoder, train_loader, val_loader, optimizer="adam", device="cuda"):
        # NOTE(review): the base constructor is called with no arguments and
        # before any attribute is assigned — confirm this matches what
        # tune.Trainable's constructor expects.
        super().__init__()
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = device
        self.model = NoiseSegModel(arch=arch, encoder=encoder)
        # The model is moved with .cuda() whenever the device is not "cpu".
        if str(self.device) != "cpu":
            self.model = self.model.cuda()
        # At this point `optimizer` is only a name; _setup replaces it with an
        # optimizer object.
        self.optimizer = optimizer
    # Builds the optimizer named in __init__ using the "lr" entry of `config`,
    # then creates the loss criterion and resets the bookkeeping fields.
    def _setup(self, config):
        # NOTE(review): self.optimizer is overwritten with the optimizer
        # object, so a second call to _setup would presumably match neither
        # name and fall through to NotImplementedError — confirm whether
        # _setup can run more than once (the run uses reuse_actors=True).
        if self.optimizer == "adam":
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=config.get("lr"))
        elif self.optimizer == "RMSprop":
            self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=config.get("lr"), weight_decay=1e-8, momentum=0.9)
        else:
            # Only "adam" and "RMSprop" are handled.
            raise NotImplementedError
        self.criterion = nn.BCEWithLogitsLoss()
        # Bookkeeping fields, all initialised to 0.
        self.best_val_score = 0
        self.best_global_step = 0
        self.start_epoch = 0
# Other methods from MyTrainable
- I have verified my script runs in a clean environment and reproduces the issue.
- [Not possible] I have verified the issue also occurs with the latest wheels.
Best
Karol
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
P1 (Issue that should be fixed within a few weeks), bug (Something that is supposed to be working; but isn't), tune (Tune-related issues)