from time import time
import numpy as np
from scipy.special import expit
from scipy.optimize import minimize
from sklearn.datasets import make_low_rank_matrix
# Generate dataset
n_samples, n_features = 100, 1000
rng = np.random.RandomState(42)
# true coefficients, w_true[0] = intercept = 1
w_true = np.r_[1, rng.randn(n_features)]
X = make_low_rank_matrix(n_samples, n_features, random_state=rng)
X[:, 0] *= 1e3
X[:, -1] *= 1e3
# add intercept column as first column, makes life easier
X = np.c_[np.ones(n_samples), X]
z = X @ w_true
z += 1e-1 * rng.randn(n_samples)
# Balanced binary classification problem
y = (z > np.median(z)).astype(float)
# define loss, gradient and hessian for binary cross entropy
# intercept = w[0]
def loss(w, X, y, alpha=1):
"""Compute binary cross entropy for y in [0, 1]."""
raw_predictions = X @ w
# logaddexp(0, x) = log(1 + exp(x))
loss = np.logaddexp(0, raw_predictions) - y * raw_predictions
# add L2 penalty
loss = np.sum(loss) + alpha/2 * w[1:] @ w[1:]
return loss
def grad(w, X, y, alpha=1):
"""Compute gradient w.r.t. coefficient w."""
y_pred = expit(X @ w)
grad = X.T @ (y_pred - y) + alpha * np.r_[0, w[1:]]
return grad
def hessp(p, w, X, y, alpha=1):
"""Compute hessian @ p."""
y_pred = expit(X @ w)
# hess = X.T @ diag(y_pred * (1-y_pred)) @ X
hessp = X @ p
hessp = y_pred * (1 - y_pred) * hessp
hessp = X.T @ hessp
hessp[1:] += alpha * p[1:]
return hessp
def hess(w, X, y, alpha=1):
"""Compute hessian."""
y_pred = expit(X @ w)
hess = X.T @ np.diag(y_pred * (1 - y_pred)) @ X
hess[1:, 1:] += alpha * np.eye(n_features)
return hess
# Solver parameter
alpha = 1. # penalization
max_iter = 1000
tol = 1e-6
w0 = np.zeros(n_features+1)
results = dict()
for m, h, hp in [("L-BFGS-B", None, None),
("trust-constr", None, None),
("trust-constr", None, hessp),
("trust-constr", hess, None),
("trust-exact", hess, None),
("trust-ncg", hess, None),
("trust-ncg", None, hessp),
("trust-krylov", None, hessp),
("trust-krylov", hess, None),
]:
tic = time()
res = minimize(loss, w0, args=(X, y, alpha),
method=m, jac=grad,
hess=h, hessp=hp,
options={"gtol": tol, "maxiter": max_iter}
)
res.duration = time() - tic
if h is not None:
i = 2 # hessian was used
elif hp is not None:
i = 1 # hessian product was used
else:
i = 0
results[m + f" {i}"] = res
for k, v in results.items():
print(f"loss with {k:>14} = {v.fun:.4f}\tduration = {v.duration:.4f} success = {v.success}")
loss with L-BFGS-B 0 = 3.3034 duration = 0.0052 success = True
loss with trust-constr 0 = 3.3034 duration = 0.1694 success = True
loss with trust-constr 1 = 8.6891 duration = 0.0287 success = True
loss with trust-constr 2 = 3.3034 duration = 0.0995 success = True
loss with trust-exact 2 = 3.3034 duration = 0.1991 success = True
loss with trust-ncg 2 = 3.3034 duration = 0.0789 success = True
loss with trust-ncg 1 = 16.7876 duration = 0.0021 success = False
loss with trust-krylov 1 = 69.3147 duration = 0.0010 success = False
loss with trust-krylov 2 = 3.3034 duration = 0.0735 success = True
Problem Statement
Not surprisingly, some solvers in
scipy.optimize.minimizeseem to have difficulties with poorly scaled, ill-conditioned objectives. While"L-BFGS-B"usually performs very well, I was surprised to see that, in particular,"trust-constr"did not perform as I expected.Is there room for improvement? [1] mentions elliptical trust regions (instead of only spherical ones).
See the details section for more information.
[1] Nocedal, Wright "Numerical Optimization"
Background
Scikit-Learn considered to use some form of preconditioning/(un-)scaling by default for logistic regression with default solver lbfgs, see scikit-learn/scikit-learn#15583.
Details
Result:
Findings:
hessp, indicated byi=1in the output, did not converge. Trust-constr result object shows success with message"'xtol' termination condition is satisfied."