"""Hessian update strategies for quasi-Newton optimization methods."""
"""Interface for implementing Hessian update strategies.
Many optimization methods make use of Hessian (or inverse Hessian)
approximations, such as the quasi-Newton methods BFGS, SR1 and L-BFGS.
Some of these approximations, however, do not actually need to store the
entire matrix or can compute the internal matrix product with a given
vector in a very efficient manner. This class serves as an abstract
interface between the optimization algorithm and the quasi-Newton update
strategies, giving freedom of implementation to store and update the
internal matrix as efficiently as possible. Different choices of
initialization and update procedure will result in different quasi-Newton
strategies.
Four methods should be implemented in derived classes: ``initialize``, ``update``, ``dot`` and ``get_matrix``.
Notes
-----
Any instance of a class that implements this interface can be accepted by
the method ``minimize`` and used by the compatible solvers to approximate
the Hessian (or inverse Hessian) used by the optimization algorithms.
"""
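# A minimal usage sketch (not part of this module): instances of strategies
# implementing this interface can be passed to ``scipy.optimize.minimize``
# through its ``hess`` argument; the 'trust-constr' method shown below is
# assumed to be one of the compatible solvers.
#
#     >>> from scipy.optimize import minimize, rosen, rosen_der, BFGS
#     >>> res = minimize(rosen, [1.3, 0.7, 0.8], method='trust-constr',
#     ...                jac=rosen_der, hess=BFGS())
#     >>> res.x  # approximate minimizer of the Rosenbrock function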
"""Initialize internal matrix.
Allocate internal memory for storing and updating the Hessian or its inverse.
Parameters
----------
n : int
    Problem dimension.
approx_type : {'hess', 'inv_hess'}
    Selects either the Hessian or the inverse Hessian.
    When set to 'hess' the Hessian will be stored and updated.
    When set to 'inv_hess' its inverse will be used instead.
"""
raise NotImplementedError("The method ``initialize(n, approx_type)``"
                          " is not implemented.")
"""Update internal matrix.
Update Hessian matrix or its inverse (depending on how 'approx_type' is defined) using information about the last evaluated points.
Parameters
----------
delta_x : ndarray
    The difference between two points the gradient
    function has been evaluated at: ``delta_x = x2 - x1``.
delta_grad : ndarray
    The difference between the gradients:
    ``delta_grad = grad(x2) - grad(x1)``.
"""
raise NotImplementedError("The method ``update(delta_x, delta_grad)``"
                          " is not implemented.")
"""Compute the product of the internal matrix with the given vector.
Parameters
----------
p : array_like
    1-d array representing a vector.
Returns
-------
Hp : array
    1-d array representing the result of multiplying the approximation
    matrix by vector p.
"""
raise NotImplementedError("The method ``dot(p)``"
                          " is not implemented.")
"""Return current internal matrix.
Returns
-------
H : ndarray, shape (n, n)
    Dense matrix containing either the Hessian or its inverse
    (depending on how 'approx_type' is defined).
"""
raise NotImplementedError("The method ``get_matrix()``"
                          " is not implemented.")
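# A minimal sketch of a custom strategy implementing the four methods above
# (illustrative only; the class name ``ScaledIdentity`` and the choice of a
# fixed scaled-identity approximation are assumptions, and the base class is
# assumed to be the ``HessianUpdateStrategy`` interface as in scipy.optimize):
#
#     class ScaledIdentity(HessianUpdateStrategy):
#         def __init__(self, scale=1.0):
#             self.scale = scale
#         def initialize(self, n, approx_type):
#             self.n = n
#             self.approx_type = approx_type
#         def update(self, delta_x, delta_grad):
#             pass  # keep the fixed approximation
#         def dot(self, p):
#             return self.scale * np.asarray(p)
#         def get_matrix(self):
#             return self.scale * np.eye(self.n)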
"""Hessian update strategy with full dimensional internal representation. """ # Symmetric matrix-vector product
# Until initialize is called we can't really use the class,
# so it makes sense to set everything to None.
self.first_iteration = None
self.approx_type = None
self.B = None
self.H = None
"""Initialize internal matrix.
Allocate internal memory for storing and updating the Hessian or its inverse.
Parameters
----------
n : int
    Problem dimension.
approx_type : {'hess', 'inv_hess'}
    Selects either the Hessian or the inverse Hessian.
    When set to 'hess' the Hessian will be stored and updated.
    When set to 'inv_hess' its inverse will be used instead.
"""
self.first_iteration = True
self.n = n
self.approx_type = approx_type
if approx_type not in ('hess', 'inv_hess'):
    raise ValueError("`approx_type` must be 'hess' or 'inv_hess'.")
# Create matrix
if self.approx_type == 'hess':
    self.B = np.eye(n, dtype=float)
else:
    self.H = np.eye(n, dtype=float)
# Heuristic to scale matrix at first iteration.
# Described in Nocedal and Wright "Numerical Optimization"
# p.143 formula (6.20).
s_norm2 = np.dot(delta_x, delta_x)
y_norm2 = np.dot(delta_grad, delta_grad)
ys = np.abs(np.dot(delta_grad, delta_x))
if ys == 0.0 or y_norm2 == 0 or s_norm2 == 0:
    return 1
if self.approx_type == 'hess':
    return y_norm2 / ys
else:
    return ys / y_norm2
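# Rough illustration of the heuristic above (a sketch, not part of this
# module): for the 'hess' approximation the first matrix is scaled by
# (y.T @ y) / |y.T @ s| and by the reciprocal factor for 'inv_hess',
# with s = delta_x and y = delta_grad. Example values are made up:
#
#     >>> s = np.array([0.1, -0.2])              # delta_x
#     >>> y = np.array([0.5, -0.3])              # delta_grad
#     >>> scale_hess = y @ y / abs(y @ s)        # ~3.09, multiplies np.eye(n)
#     >>> scale_inv_hess = abs(y @ s) / (y @ y)  # ~0.32 for 'inv_hess'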
raise NotImplementedError("The method ``_update_implementation``" " is not implemented.")
"""Update internal matrix.
Update Hessian matrix or its inverse (depending on how 'approx_type' is defined) using information about the last evaluated points.
Parameters
----------
delta_x : ndarray
    The difference between two points the gradient
    function has been evaluated at: ``delta_x = x2 - x1``.
delta_grad : ndarray
    The difference between the gradients:
    ``delta_grad = grad(x2) - grad(x1)``.
"""
if np.all(delta_x == 0.0):
    return
if np.all(delta_grad == 0.0):
    warn('delta_grad == 0.0. Check if the approximated '
         'function is linear. If the function is linear '
         'better results can be obtained by defining the '
         'Hessian as zero instead of using quasi-Newton '
         'approximations.', UserWarning)
    return
if self.first_iteration:
    # Get user specific scale
    if self.init_scale == "auto":
        scale = self._auto_scale(delta_x, delta_grad)
    else:
        scale = float(self.init_scale)
    # Scale initial matrix with ``scale * np.eye(n)``
    if self.approx_type == 'hess':
        self.B *= scale
    else:
        self.H *= scale
    self.first_iteration = False
self._update_implementation(delta_x, delta_grad)
"""Compute the product of the internal matrix with the given vector.
Parameters
----------
p : array_like
    1-d array representing a vector.
Returns
-------
Hp : array
    1-d array representing the result of multiplying the approximation
    matrix by vector p.
"""
if self.approx_type == 'hess':
    return self._symv(1, self.B, p)
else:
    return self._symv(1, self.H, p)
"""Return the current internal matrix.
Returns
-------
M : ndarray, shape (n, n)
    Dense matrix containing either the Hessian or its inverse
    (depending on how `approx_type` was defined).
"""
if self.approx_type == 'hess':
    M = np.copy(self.B)
else:
    M = np.copy(self.H)
li = np.tril_indices_from(M, k=-1)
M[li] = M.T[li]
return M
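# A small end-to-end sketch of the sequence of calls above (illustrative;
# the points and gradients below are made-up data, not from this module):
#
#     >>> from scipy.optimize import BFGS
#     >>> hess = BFGS()
#     >>> hess.initialize(n=2, approx_type='hess')
#     >>> x1, x2 = np.array([0.0, 0.0]), np.array([1.0, 0.5])
#     >>> g1, g2 = np.array([-1.0, -2.0]), np.array([1.5, 0.0])
#     >>> hess.update(x2 - x1, g2 - g1)          # one quasi-Newton update
#     >>> Hp = hess.dot(np.array([1.0, 0.0]))    # matrix-vector product
#     >>> B = hess.get_matrix()                  # dense approximation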
"""Broyden-Fletcher-Goldfarb-Shanno (BFGS) Hessian update strategy.
Parameters
----------
exception_strategy : {'skip_update', 'damp_update'}, optional
    Define how to proceed when the curvature condition is violated.
    Set it to 'skip_update' to just skip the update. Or, alternatively,
    set it to 'damp_update' to interpolate between the actual BFGS
    result and the unmodified matrix. Both exception strategies
    are explained in [1]_, p.536-537.
min_curvature : float
    This number, scaled by a normalization factor, defines the
    minimum curvature ``dot(delta_grad, delta_x)`` allowed to go
    unaffected by the exception strategy. By default it is equal to
    1e-8 when ``exception_strategy = 'skip_update'`` and equal
    to 0.2 when ``exception_strategy = 'damp_update'``.
init_scale : {float, 'auto'}
    Matrix scale at first iteration. At the first
    iteration the Hessian matrix or its inverse will be initialized
    with ``init_scale*np.eye(n)``, where ``n`` is the problem dimension.
    Set it to 'auto' in order to use an automatic heuristic for choosing
    the initial scale. The heuristic is described in [1]_, p.143.
    By default uses 'auto'.
Notes
-----
The update is based on the description in [1]_, p.140.
References
----------
.. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
       Second Edition (2006).
"""
def __init__(self, exception_strategy='skip_update', min_curvature=None,
             init_scale='auto'):
    if exception_strategy == 'skip_update':
        if min_curvature is not None:
            self.min_curvature = min_curvature
        else:
            self.min_curvature = 1e-8
    elif exception_strategy == 'damp_update':
        if min_curvature is not None:
            self.min_curvature = min_curvature
        else:
            self.min_curvature = 0.2
    else:
        raise ValueError("`exception_strategy` must be 'skip_update' "
                         "or 'damp_update'.")
    super(BFGS, self).__init__(init_scale)
    self.exception_strategy = exception_strategy
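# Construction sketch (illustrative; values are arbitrary): a fixed initial
# scale may be supplied instead of the 'auto' heuristic, together with the
# damped exception strategy described in the class docstring.
#
#     >>> from scipy.optimize import BFGS
#     >>> hess = BFGS(exception_strategy='damp_update', min_curvature=0.2,
#     ...             init_scale=1.0)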
"""Update the inverse Hessian matrix.
BFGS update using the formula:
``H <- H + ((H*y).T*y + s.T*y)/(s.T*y)^2 * (s*s.T)
         - 1/(s.T*y) * ((H*y)*s.T + s*(H*y).T)``
where ``s = delta_x`` and ``y = delta_grad``. This formula is equivalent to (6.17) in [1]_ written in a more efficient way for implementation.
References
----------
.. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
       Second Edition (2006).
"""
self.H = self._syr2(-1.0 / ys, s, Hy, a=self.H)
self.H = self._syr((ys+yHy)/ys**2, s, a=self.H)
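# Dense-array equivalent of the two BLAS calls above (a sketch for clarity;
# ``ys = s @ y``, ``Hy = H @ y`` and ``yHy = y @ Hy`` follow the docstring
# notation and are assumed to be available):
#
#     H_new = (H
#              - (np.outer(Hy, s) + np.outer(s, Hy)) / ys
#              + (ys + yHy) / ys**2 * np.outer(s, s))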
"""Update the Hessian matrix.
BFGS update using the formula:
``B <- B - (B*s)*(B*s).T / (s.T*(B*s)) + (y*y.T) / (s.T*y)``
where ``s`` is short for ``delta_x`` and ``y`` is short for ``delta_grad``. Formula (6.19) in [1]_.
References
----------
.. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
       Second Edition (2006).
"""
self.B = self._syr(1.0 / ys, y, a=self.B)
self.B = self._syr(-1.0 / sBs, Bs, a=self.B)
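# Dense-array equivalent of the two rank-1 BLAS updates above (a sketch;
# ``ys = s @ y``, ``Bs = B @ s`` and ``sBs = s @ Bs`` follow the docstring
# notation and are assumed to be available):
#
#     B_new = B + np.outer(y, y) / ys - np.outer(Bs, Bs) / sBs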
# Auxiliary variables w and z
if self.approx_type == 'hess':
    w = delta_x
    z = delta_grad
else:
    w = delta_grad
    z = delta_x
# Do some common operations
wz = np.dot(w, z)
Mw = self.dot(w)
wMw = Mw.dot(w)
# Guarantee that wMw > 0 by reinitializing matrix.
# While this is always true in exact arithmetic,
# indefinite matrices may appear due to roundoff errors.
if wMw <= 0.0:
    scale = self._auto_scale(delta_x, delta_grad)
    # Reinitialize matrix
    if self.approx_type == 'hess':
        self.B = scale * np.eye(self.n, dtype=float)
    else:
        self.H = scale * np.eye(self.n, dtype=float)
    # Do common operations for new matrix
    Mw = self.dot(w)
    wMw = Mw.dot(w)
# Check if curvature condition is violated
if wz <= self.min_curvature * wMw:
    # If the option 'skip_update' is set
    # we just skip the update when the condition
    # is violated.
    if self.exception_strategy == 'skip_update':
        return
    # If the option 'damp_update' is set we
    # interpolate between the actual BFGS
    # result and the unmodified matrix.
    elif self.exception_strategy == 'damp_update':
        update_factor = (1-self.min_curvature) / (1 - wz/wMw)
        z = update_factor*z + (1-update_factor)*Mw
        wz = np.dot(w, z)
# Update matrix
if self.approx_type == 'hess':
    self._update_hessian(wz, Mw, wMw, z)
else:
    self._update_inverse_hessian(wz, Mw, wMw, z)
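# Brief sketch contrasting the two exception strategies on a pair with
# negative curvature (illustrative data, not from this module; here
# ``dot(delta_grad, delta_x) < 0`` so the curvature condition is violated):
#
#     >>> from scipy.optimize import BFGS
#     >>> dx = np.array([1.0, 0.0])
#     >>> dg = np.array([-0.5, 0.2])          # dg @ dx = -0.5 < 0
#     >>> skip = BFGS(exception_strategy='skip_update')
#     >>> skip.initialize(2, 'hess')
#     >>> skip.update(dx, dg)                 # update is skipped
#     >>> damp = BFGS(exception_strategy='damp_update')
#     >>> damp.initialize(2, 'hess')
#     >>> damp.update(dx, dg)                 # damped update is applied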
"""Symmetric-rank-1 Hessian update strategy.
Parameters
----------
min_denominator : float
    This number, scaled by a normalization factor,
    defines the minimum denominator magnitude allowed
    in the update. When the condition is violated we skip
    the update. By default uses ``1e-8``.
init_scale : {float, 'auto'}, optional
    Matrix scale at first iteration. At the first
    iteration the Hessian matrix or its inverse will be initialized
    with ``init_scale*np.eye(n)``, where ``n`` is the problem dimension.
    Set it to 'auto' in order to use an automatic heuristic for choosing
    the initial scale. The heuristic is described in [1]_, p.143.
    By default uses 'auto'.
Notes
-----
The update is based on the description in [1]_, p.144-146.
References
----------
.. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
       Second Edition (2006).
"""
self.min_denominator = min_denominator
super(SR1, self).__init__(init_scale)
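# Construction sketch (illustrative; the threshold value is arbitrary):
# SR1 can likewise be passed as the ``hess`` argument of compatible solvers,
# optionally with a custom denominator threshold.
#
#     >>> from scipy.optimize import SR1
#     >>> hess = SR1(min_denominator=1e-10)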
# Auxiliary variables w and z
if self.approx_type == 'hess':
    w = delta_x
    z = delta_grad
else:
    w = delta_grad
    z = delta_x
# Do some common operations
Mw = self.dot(w)
z_minus_Mw = z - Mw
denominator = np.dot(w, z_minus_Mw)
# If the denominator is too small
# we just skip the update.
if np.abs(denominator) <= self.min_denominator*norm(w)*norm(z_minus_Mw):
    return
# Update matrix
if self.approx_type == 'hess':
    self.B = self._syr(1/denominator, z_minus_Mw, a=self.B)
else:
    self.H = self._syr(1/denominator, z_minus_Mw, a=self.H)
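# Dense-array equivalent of the rank-1 SR1 update performed above (a sketch;
# the names mirror the local variables of the method):
#
#     B_new = B + np.outer(z_minus_Mw, z_minus_Mw) / denominator
#
# which, for the 'hess' case with s = delta_x and y = delta_grad, matches the
# textbook form B + (y - B*s)*(y - B*s).T / ((y - B*s).T*s).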