24 changes: 23 additions & 1 deletion optuna/_gp/acqf.py
@@ -119,11 +119,33 @@ def __init__(
gpr: GPRegressor,
search_space: SearchSpace,
threshold: float,
normalized_params_of_running_trials: np.ndarray | None = None,
stabilizing_noise: float = 1e-12,
) -> None:
self._gpr = gpr
self._stabilizing_noise = stabilizing_noise
self._threshold = threshold

if normalized_params_of_running_trials is not None:
normalized_params_of_running_trials = torch.from_numpy(
normalized_params_of_running_trials
)

# NOTE(sawa3030): To handle running trials, the `best` constant liar strategy is
# currently implemented, as it is simple and performs well in our benchmarks.
# We plan to implement Monte-Carlo based approaches (e.g., BoTorch’s fantasize)
# in the near future.
# See https://github.com/optuna/optuna/pull/6430 for details.
constant_liar_value = self._gpr._y_train.max()
nabenabe0928 (Contributor) commented on Feb 14, 2026:

Could you add an inline comment about this design choice?
Namely, we could use the mean (posterior mean), median, min, or any other operator, right?
Plus, as you may already know, we often integrate out the value because we can calculate the posterior distributions for each running trial.
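
For illustration, a minimal sketch of these alternatives (hypothetical and not part of this PR; it reuses the `gpr` and `normalized_params_of_running_trials` names from the diff above):

```python
# Illustrative alternatives to the `best` constant liar (not part of this PR).
best_liar = gpr._y_train.max()   # current choice: optimistic "best" liar
mean_liar = gpr._y_train.mean()  # "mean" liar: neutral assumption
min_liar = gpr._y_train.min()    # pessimistic "worst" liar

# Posterior-mean liar: impute each running trial with the GP posterior mean at its
# parameters (a step toward integrating the value out, as suggested above).
posterior_mean, _ = gpr.posterior(normalized_params_of_running_trials)
```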

Collaborator (author) replied:

Thank you for the comment. I have added a few inline comments. I also left some notes on issue #6392 regarding the future implementation plan.

constant_liar_y = constant_liar_value.expand(
normalized_params_of_running_trials.shape[0]
)

self._gpr.append_running_data(
normalized_params_of_running_trials,
constant_liar_y,
)

super().__init__(gpr.length_scales, search_space)

def eval_acqf(self, x: torch.Tensor) -> torch.Tensor:
@@ -205,7 +227,7 @@ def __init__(
assert (
len(constraints_gpr_list) == len(constraints_threshold_list) and constraints_gpr_list
)
self._acqf = LogEI(gpr, search_space, threshold, stabilizing_noise)
self._acqf = LogEI(gpr, search_space, threshold, None, stabilizing_noise)
self._constraints_acqf_list = [
LogPI(_gpr, search_space, _threshold, stabilizing_noise)
for _gpr, _threshold in zip(constraints_gpr_list, constraints_threshold_list)
38 changes: 37 additions & 1 deletion optuna/_gp/gp.py
@@ -103,6 +103,8 @@ def __init__(
self._is_categorical = is_categorical
self._X_train = X_train
self._y_train = y_train
self._X_all = X_train
self._y_all = y_train
self._squared_X_diff = (X_train.unsqueeze(-2) - X_train.unsqueeze(-3)).square_()
if self._is_categorical.any():
self._squared_X_diff[..., self._is_categorical] = (
@@ -146,6 +148,40 @@ def _cache_matrix(self) -> None:
self.noise_var = self.noise_var.detach()
self.noise_var.grad = None

def append_running_data(self, X_running: torch.Tensor, y_running: torch.Tensor) -> None:
kAIto47802 (Collaborator) commented on Feb 13, 2026:

I think we can compute this more efficiently by reusing self._cov_Y_Y_chol, which we already have.
This would reduce the computational cost from the current
   $$O((\text{n\_train} + \text{n\_running})^3)$$
   $$= O(\text{n\_train}^3 + 3\,\text{n\_train}^2\:\text{n\_running} + 3\,\text{n\_train}\:\text{n\_running}^2 + \text{n\_running}^3),$$
to
   $$O(\text{n\_train}^2\:\text{n\_running} + \text{n\_train}\:\text{n\_running}^2 + \text{n\_running}^3).$$
Considering that $\text{n\_train}$ is the dominant size ($\text{n\_train} \gg \text{n\_running}$), reducing the order of $\text{n\_train}$ from 3 to 2 will improve the runtime.
This will also eliminate the need for storing self._cov_Y_Y.
However, it is also acceptable to leave the current one and address this in a follow-up PR, since the current implementation itself is not incorrect.


We can calculate this as follows:

Let $A$ be the symmetric positive definite matrix (in this case, self._cov_Y_Y) and define its Cholesky decomposition as $LL^\top = A$ (in this case, $L$ corresponds to self._cov_Y_Y_chol). Let $M$ be a symmetric positive definite matrix partitioned as:

$$M = \begin{bmatrix} A & X \\ X^\top & Y \end{bmatrix}$$

(in this case, $X$ corresponds to self.kernel(X_running), and $Y$ corresponds to self.kernel(X_running, X_running)).
Then, the Cholesky decomposition of $M$ is given by:

$$\mathrm{chol}(M) = \begin{bmatrix} L & 0 \\ Z^\top & C \end{bmatrix},$$

where $Z := L^{-1} X$ and $C := \mathrm{chol}(Y - Z^\top Z)$.
Note that we can compute $Z$ efficiently by solving $LZ = X$, since $L$ is a triangular matrix. (Use scipy.linalg.solve_triangular.)

Proof.
Assume that $M$ admits a block Cholesky factorization of the form:

$$M = \begin{bmatrix} L & 0 \\ B & C \end{bmatrix} \begin{bmatrix} L^\top & B^\top \\ 0 & C^\top \end{bmatrix} = \begin{bmatrix} LL^\top & LB^\top \\ BL^\top & BB^\top + CC^\top \end{bmatrix}.$$

Comparing this with $M = \begin{bmatrix} A & X \\ X^\top & Y \end{bmatrix}$, we obtain:

$$LB^\top = X \quad \text{and} \quad BB^\top + CC^\top = Y. \qquad (1)$$

Thus, it follows that $B^\top = L^{-1}X$. Defining $Z := L^{-1}X$, we have $B = Z^\top$, and hence $BB^\top = Z^\top Z$.
Substituting this into Equation (1) yields:

$$CC^\top = Y - Z^\top Z = Y - (L^{-1}X)^\top L^{-1}X = Y - X^\top (L^{-1})^\top L^{-1}X = Y - X^\top A^{-1} X.$$

Since $M$ is symmetric positive definite, the Schur complement $Y - X^\top A^{-1} X = Y - Z^\top Z$ is also symmetric positive definite. Therefore, its Cholesky factor exists and $C$ can be chosen as $C = \mathrm{chol}(Y - Z^\top Z)$.
This completes the proof. □
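
As a sanity check, here is a minimal NumPy sketch of this block update (illustrative only; the names are hypothetical and not part of the PR):

```python
import numpy as np
import scipy.linalg

rng = np.random.default_rng(0)
n_train, n_running = 8, 3
n_total = n_train + n_running

# Build a random SPD matrix M and partition it into blocks A, X, Y.
P = rng.standard_normal((n_total, n_total))
M = P @ P.T + n_total * np.eye(n_total)
A, X, Y = M[:n_train, :n_train], M[:n_train, n_train:], M[n_train:, n_train:]

L = np.linalg.cholesky(A)                            # chol(A), cached in practice
Z = scipy.linalg.solve_triangular(L, X, lower=True)  # Z = L^{-1} X via forward substitution
C = np.linalg.cholesky(Y - Z.T @ Z)                  # Cholesky factor of the Schur complement
chol_M = np.block([[L, np.zeros((n_train, n_running))], [Z.T, C]])

assert np.allclose(chol_M, np.linalg.cholesky(M))    # matches the full factorization
```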

Collaborator (author) replied:

Thank you so much for the precise explanation! I made the update accordingly.

not522 marked this conversation as resolved.
assert self._cov_Y_Y_chol is not None and self._cov_Y_Y_inv_Y is not None, (
"Call _cache_matrix before append_running_data"
)
n_train = self._X_train.shape[0]
n_running = X_running.shape[0]
n_total = n_train + n_running

cov_Y_Y_chol = np.zeros((n_total, n_total), dtype=np.float64)
cov_Y_Y_chol[:n_train, :n_train] = self._cov_Y_Y_chol.numpy()
with torch.no_grad():
kernel_running_train = self.kernel(X_running).detach().cpu().numpy()
kernel_running_running = self.kernel(X_running, X_running).detach().cpu().numpy()
kernel_running_running[np.diag_indices(n_running)] += self.noise_var.item()

cov_Y_Y_chol[n_train:, :n_train] = scipy.linalg.solve_triangular(
self._cov_Y_Y_chol.cpu().numpy(), kernel_running_train.T, lower=True
).T
cov_Y_Y_chol[n_train:, n_train:] = np.linalg.cholesky(
kernel_running_running
- cov_Y_Y_chol[n_train:, :n_train] @ cov_Y_Y_chol[n_train:, :n_train].T
)
self._y_all = torch.cat([self._y_train, y_running], dim=0)
cov_Y_Y_inv_Y = scipy.linalg.solve_triangular(
Collaborator commented:

I think we can also compute this part a little more efficiently by reusing self._cov_Y_Y_inv_Y, reducing the cost from $O((\text{n\_train} + \text{n\_running})^2)$ to $O(\text{n\_train}\,\text{n\_running} + \text{n\_running}^2)$.
However, this is not very important and we can leave it as it is, since the overall complexity of append_running_data is bottlenecked by computing cov_Y_Y_chol.

Collaborator (author) replied:

Thank you for the helpful suggestion! I agree that the forward substitution, scipy.linalg.solve_triangular(cov_Y_Y_chol, self._y_all.cpu().numpy(), lower=True) can likely be reduced to O(n_train * n_running + n_running**2) by caching the corresponding train-only term,
scipy.linalg.solve_triangular(cov_Y_Y_chol, self._y_train.cpu().numpy(), lower=True) in _cache_matrix.

What I was less certain about was whether the backward substitution could also be reduced to the same complexity. As you mentioned, I think this would be a good optimization to consider in a future improvement.

kAIto47802 (Collaborator) replied on Feb 25, 2026:

Thank you for the reply. Indeed, yes. We need $W := A^{-1}X$, obtained by solving $L^\top W = Z$, which takes $O(\text{n\_train}^2\,\text{n\_running})$. Also, the overall complexity of append_running_data is already bottlenecked by the computation of cov_Y_Y_chol, which we cannot improve further.
Therefore, we can achieve only a constant-factor improvement here, which is why I felt this is not very important in contrast to the one I mentioned above. Also, I'm not very certain how much this constant-factor improvement would contribute to the overall runtime without benchmarking it.
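
For reference, a minimal sketch of the block forward substitution discussed in this thread (illustrative only; the function and variable names are hypothetical and reuse the $L$, $Z$, $C$ notation from the block-Cholesky comment above):

```python
import numpy as np
import scipy.linalg

# Assume chol_M = [[L, 0], [Z.T, C]] and y_all = concat(y_train, y_running).
def block_forward_substitution(L, Z, C, y_train, y_running):
    # First block: v_train = L^{-1} y_train. This part depends only on the
    # training data, so it could be cached in _cache_matrix.
    v_train = scipy.linalg.solve_triangular(L, y_train, lower=True)
    # Second block: solve C v_running = y_running - Z.T @ v_train.
    v_running = scipy.linalg.solve_triangular(C, y_running - Z.T @ v_train, lower=True)
    return np.concatenate([v_train, v_running])
```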

cov_Y_Y_chol.T,
scipy.linalg.solve_triangular(cov_Y_Y_chol, self._y_all.cpu().numpy(), lower=True),
lower=False,
)

# NOTE(nabenabe): Here we use NumPy to guarantee the reproducibility from the past.
self._cov_Y_Y_chol = torch.from_numpy(cov_Y_Y_chol)
self._cov_Y_Y_inv_Y = torch.from_numpy(cov_Y_Y_inv_Y)
self._X_all = torch.cat([self._X_train, X_running], dim=0)

def kernel(
self, X1: torch.Tensor | None = None, X2: torch.Tensor | None = None
) -> torch.Tensor:
@@ -193,7 +229,7 @@ def posterior(self, x: torch.Tensor, joint: bool = False) -> tuple[torch.Tensor,
)
is_single_point = x.ndim == 1
x_ = x if not is_single_point else x.unsqueeze(0)
mean = torch.linalg.vecdot(cov_fx_fX := self.kernel(x_), self._cov_Y_Y_inv_Y)
mean = torch.linalg.vecdot(cov_fx_fX := self.kernel(x_, self._X_all), self._cov_Y_Y_inv_Y)
# K @ inv(C) = V --> K = V @ C --> K = V @ L @ L.T
V = torch.linalg.solve_triangular(
self._cov_Y_Y_chol,
15 changes: 13 additions & 2 deletions optuna/_gp/search_space.py
@@ -62,14 +62,25 @@ def __init__(
def get_normalized_params(
self,
trials: list[FrozenTrial],
trial_params: list[dict[str, Any]] | None = None,
) -> np.ndarray:
values = np.empty((len(trials), len(self._optuna_search_space)), dtype=float)

# `trial_params` can be passed explicitly when `trials` contain unfinished trials.
# In that case, parameter values may not be available in `trial.params`, so callers
# should provide them explicitly. Otherwise, they are taken from `trials`.
if trial_params is None:
trial_params = [trial.params for trial in trials]
not522 marked this conversation as resolved.

for i, (param, distribution) in enumerate(self._optuna_search_space.items()):
if isinstance(distribution, CategoricalDistribution):
values[:, i] = [distribution.to_internal_repr(t.params[param]) for t in trials]
values[:, i] = [
distribution.to_internal_repr(trial_param[param])
for trial_param in trial_params
]
else:
values[:, i] = _normalize_one_param(
np.array([trial.params[param] for trial in trials]),
np.array([trial_param[param] for trial_param in trial_params]),
self._scale_types[i],
(self._bounds[i, 0], self._bounds[i, 1]),
self._steps[i],
90 changes: 79 additions & 11 deletions optuna/samplers/_gp/sampler.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import json
from typing import Any
from typing import TYPE_CHECKING

@@ -49,6 +50,10 @@

EPS = 1e-10

_RELATIVE_PARAMS_KEY = "gp:relative_params"
# The value of system_attrs must be less than 2046 characters on RDBStorage.
_SYSTEM_ATTR_MAX_LENGTH = 2045


def _standardize_values(values: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
clipped_values = gp.warn_and_convert_inf(values)
@@ -300,32 +305,69 @@ def _get_best_params_for_multi_objective(
chosen_indices = self._rng.rng.choice(n_pareto_sols, size=size, replace=False)
return pareto_params[chosen_indices]

def _get_normalized_params_of_running_trials(
self,
trials: list[FrozenTrial],
internal_search_space: gp_search_space.SearchSpace,
) -> np.ndarray | None:
running_trials = [
t
for t in trials
if internal_search_space._optuna_search_space.keys() <= _get_params(t).keys()
and t.state == TrialState.RUNNING
]
if len(running_trials) == 0:
return None
return internal_search_space.get_normalized_params(
running_trials, [_get_params(t) for t in running_trials]
)

def sample_relative(
self, study: Study, trial: FrozenTrial, search_space: dict[str, BaseDistribution]
) -> dict[str, Any]:
if search_space == {}:
return {}

states = (TrialState.COMPLETE,)
trials = study._get_trials(deepcopy=False, states=states, use_cache=True)
states = (TrialState.COMPLETE, TrialState.RUNNING)
# At present, running trials are taken into account only in single-objective
# unconstrained optimization.
use_cache = len(study.directions) > 1 or self._constraints_func is not None
trials = study._get_trials(deepcopy=False, states=states, use_cache=use_cache)
completed_trials = [t for t in trials if t.state == TrialState.COMPLETE]

if len(trials) < self._n_startup_trials:
if len(completed_trials) < self._n_startup_trials:
return {}

# Force CPU device for all torch operations to avoid issues when
# torch.set_default_device("cuda") is set globally (issue #6113).
with torch.device("cpu"):
return self._sample_relative_impl(study, trials, search_space)
params = self._sample_relative_impl(study, completed_trials, trials, search_space)

if params != {}:
# Share the params obtained by the relative sampling with the other processes.
params_str = json.dumps(params)
for i in range(0, len(params_str), _SYSTEM_ATTR_MAX_LENGTH):
study._storage.set_trial_system_attr(
trial._trial_id,
f"{_RELATIVE_PARAMS_KEY}:{i // _SYSTEM_ATTR_MAX_LENGTH}",
params_str[i : i + _SYSTEM_ATTR_MAX_LENGTH],
)

return params

def _sample_relative_impl(
self, study: Study, trials: list[FrozenTrial], search_space: dict[str, BaseDistribution]
self,
study: Study,
completed_trials: list[FrozenTrial],
trials: list[FrozenTrial],
search_space: dict[str, BaseDistribution],
) -> dict[str, Any]:
internal_search_space = gp_search_space.SearchSpace(search_space)
normalized_params = internal_search_space.get_normalized_params(trials)
normalized_params = internal_search_space.get_normalized_params(completed_trials)

_sign = np.array([-1.0 if d == StudyDirection.MINIMIZE else 1.0 for d in study.directions])
standardized_score_vals, _, _ = _standardize_values(
_sign * np.array([trial.values for trial in trials])
_sign * np.array([trial.values for trial in completed_trials])
)

if (
@@ -363,6 +405,11 @@ def _sample_relative_impl(
gpr=gprs_list[0],
search_space=internal_search_space,
threshold=standardized_score_vals[:, 0].max(),
normalized_params_of_running_trials=(
self._get_normalized_params_of_running_trials(
trials, internal_search_space
)
),
)
best_params = normalized_params[np.argmax(standardized_score_vals), np.newaxis]
else:
@@ -379,7 +426,9 @@
else:
if n_objectives == 1:
assert len(gprs_list) == 1
constraint_vals, is_feasible = _get_constraint_vals_and_feasibility(study, trials)
constraint_vals, is_feasible = _get_constraint_vals_and_feasibility(
study, completed_trials
)
y_with_neginf = np.where(is_feasible, standardized_score_vals[:, 0], -np.inf)
# TODO(kAIto47802): If all trials are infeasible, the acquisition function
# for the objective function can be ignored, so skipping the computation
Expand All @@ -403,7 +452,9 @@ def _sample_relative_impl(
None if np.isneginf(best_feasible_y) else normalized_params[i_opt, np.newaxis]
)
else:
constraint_vals, is_feasible = _get_constraint_vals_and_feasibility(study, trials)
constraint_vals, is_feasible = _get_constraint_vals_and_feasibility(
study, completed_trials
)
constr_gpr_list, constr_threshold_list = self._get_constraints_acqf_args(
constraint_vals, internal_search_space, normalized_params
)
@@ -442,8 +493,8 @@ def sample_independent(
) -> Any:
if self._warn_independent_sampling:
states = (TrialState.COMPLETE,)
complete_trials = study._get_trials(deepcopy=False, states=states, use_cache=True)
if len(complete_trials) >= self._n_startup_trials:
completed_trials = study._get_trials(deepcopy=False, states=states, use_cache=True)
if len(completed_trials) >= self._n_startup_trials:
self._log_independent_sampling(trial, param_name)
return self._independent_sampler.sample_independent(
study, trial, param_name, param_distribution
@@ -479,3 +530,20 @@ def _get_constraint_vals_and_feasibility(
is_feasible = np.all(constraint_vals <= 0, axis=1)
assert not isinstance(is_feasible, np.bool_), "MyPy Redefinition for NumPy v2.2.0."
return constraint_vals, is_feasible


def _get_params(trial: FrozenTrial) -> dict[str, Any]:
if trial.state.is_finished():
return trial.params

params_strs = []
i = 0
while params_str_i := trial.system_attrs.get(f"{_RELATIVE_PARAMS_KEY}:{i}"):
params_strs.append(params_str_i)
i += 1

if len(params_strs) == 0:
return trial.params
params = json.loads("".join(params_strs))
params.update(trial.params)
return params
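
For reference, a hypothetical sketch of how the chunked relative params round-trip between sample_relative and _get_params above (illustrative only; the storage is mocked with a plain dict):

```python
import json

_RELATIVE_PARAMS_KEY = "gp:relative_params"
_SYSTEM_ATTR_MAX_LENGTH = 2045

params = {"x": 0.5, "y": 3}
params_str = json.dumps(params)

# Writing side (as in sample_relative): split the JSON string into chunks.
system_attrs = {}
for i in range(0, len(params_str), _SYSTEM_ATTR_MAX_LENGTH):
    key = f"{_RELATIVE_PARAMS_KEY}:{i // _SYSTEM_ATTR_MAX_LENGTH}"
    system_attrs[key] = params_str[i : i + _SYSTEM_ATTR_MAX_LENGTH]

# Reading side (as in _get_params): reassemble chunks 0, 1, ... until a key is missing.
chunks, i = [], 0
while chunk := system_attrs.get(f"{_RELATIVE_PARAMS_KEY}:{i}"):
    chunks.append(chunk)
    i += 1
assert json.loads("".join(chunks)) == params
```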
49 changes: 49 additions & 0 deletions tests/gp_tests/test_gp.py
@@ -140,3 +140,52 @@ def test_posterior(x_shape: tuple[int, ...]) -> None:
), "Diagonal Check."
assert torch.allclose(covar, covar.transpose(-2, -1)), "Symmetric Check."
assert torch.all(torch.det(covar) >= 0.0), "Positive Semi-definite Check."


@pytest.mark.parametrize("n_running", [1, 5])
def test_append_running_data(n_running: int) -> None:
dim = 3
rng = np.random.RandomState(0)
X = torch.from_numpy(rng.random(size=(10, dim)))
Y = torch.from_numpy(rng.randn(10))
Y = (Y - Y.mean()) / Y.std()
log_prior = prior.default_log_prior
minimum_noise = prior.DEFAULT_MINIMUM_NOISE_VAR
gtol: float = 1e-2
gpr = GPRegressor(
X_train=X,
y_train=Y,
is_categorical=torch.from_numpy(np.zeros(X.shape[-1], dtype=bool)),
inverse_squared_lengthscales=torch.ones(X.shape[1], dtype=torch.float64),
kernel_scale=torch.tensor(1.0, dtype=torch.float64),
noise_var=torch.tensor(1.0, dtype=torch.float64),
)._fit_kernel_params(
log_prior=log_prior,
minimum_noise=minimum_noise,
deterministic_objective=False,
gtol=gtol,
)

X_running = torch.from_numpy(rng.random(size=(n_running, dim)))
y_running = torch.from_numpy(rng.randn(n_running))

reference_gpr = GPRegressor(
X_train=torch.cat([X, X_running], dim=0),
y_train=torch.cat([Y, y_running], dim=0),
is_categorical=torch.from_numpy(np.zeros(X.shape[-1] + n_running, dtype=bool)),
inverse_squared_lengthscales=gpr.inverse_squared_lengthscales.clone(),
kernel_scale=gpr.kernel_scale.clone(),
noise_var=gpr.noise_var.clone(),
)
reference_gpr._cache_matrix()

gpr.append_running_data(X_running, y_running)

assert torch.allclose(reference_gpr._cov_Y_Y_chol, gpr._cov_Y_Y_chol)
assert torch.allclose(reference_gpr._cov_Y_Y_inv_Y, gpr._cov_Y_Y_inv_Y)

x = torch.from_numpy(rng.random(size=(1, dim)))
mean, var = gpr.posterior(x)
reference_mean, reference_var = reference_gpr.posterior(x)
assert torch.allclose(mean, reference_mean)
assert torch.allclose(var, reference_var)