import time
import copy
from collections import deque
import numpy as np
from pysit.optimization.optimization import OptimizationBase
__all__ = ['LBFGS']
__docformat__ = "restructuredtext en"
class LBFGS(OptimizationBase):
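    """Limited-memory BFGS (L-BFGS) optimization routine.

    Keeps the `memory_length` most recent curvature pairs and computes the
    search direction with the two-loop recursion of Nocedal and Wright (p. 178).

    Parameters
    ----------
    objective : objective function
        Objective whose value and gradient drive the inversion.
    memory_length : int, optional
        Number of (rho, s, y) pairs to retain; None retains all of them.
    reset_on_new_inner_loop_call : bool, optional
        If True (default), clear the memory whenever inner_loop() is called.

    Examples
    --------
    A minimal sketch, assuming `objective`, `shots`, and `initial_model` are
    built elsewhere and that the inherited OptimizationBase call signature
    applies::

        invalg = LBFGS(objective, memory_length=10)
        result = invalg(shots, initial_model, 20, verbose=True)
    """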
def __init__(self, objective, memory_length=None, reset_on_new_inner_loop_call=True, *args, **kwargs):
OptimizationBase.__init__(self, objective, *args, **kwargs)
self.prev_alpha = None
self.prev_model = None
        # collections.deque treats maxlen=None as "unbounded", so
        # memory_length=None keeps every (rho, s, y) pair (full-memory BFGS)
        self.memory_length = memory_length
self.reset_on_new_inner_loop_call = reset_on_new_inner_loop_call
self._reset_memory()
def _reset_memory(self):
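        # A deque created with maxlen evicts its oldest entry on append, so
        # only the most recent memory_length (rho, s, y) triples are kept.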
self.memory = deque([], maxlen=self.memory_length)
self._reset_line_search = True
self.prev_model = None
    def inner_loop(self, *args, **kwargs):
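        """Run the inner optimization loop, optionally resetting the memory.

        With reset_on_new_inner_loop_call=True (the default), each call starts
        from an empty curvature history, which is appropriate when the outer
        loop changes the problem (for example, moving to a new frequency band).
        """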
if self.reset_on_new_inner_loop_call:
self._reset_memory()
OptimizationBase.inner_loop(self, *args, **kwargs)
def _select_step(self, shots, current_objective_value, gradient, iteration, objective_arguments, **kwargs):
"""Compute the LBFGS update for a set of shots.
Gives the step s as a function of the gradient vector. Implemented as in p178 of Nocedal and Wright.
Parameters
----------
shots : list of pysit.Shot
List of Shots for which to compute.
grad : ndarray
Gradient vector.
i : int
Current time index.
"""
mem = self.memory
q = copy.deepcopy(gradient)
        x_k = copy.deepcopy(self.base_model)  # not sure if the copy is required
        # Complete the newest memory entry. At the end of the previous
        # iteration we stored a copy of the negative gradient, -g_k, in the
        # y-slot, because y_k needs the gradient g_{k+1} that has only now
        # been computed: adding the current gradient to the stored -g_k
        # yields y_k. The s-slot is completed the same way, as a model
        # difference. We cannot reuse the `step` variable from the end of
        # that iteration, because inner_loop() updates the model as
        # self.base_model += step and the model parameter class may enforce
        # bounds or apply other postprocessing, so the realized update can
        # differ from `step`.
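        # In symbols (notation as in Nocedal and Wright, Sec. 7.2):
        #   s_k   = x_{k+1} - x_k
        #   y_k   = g_{k+1} - g_k
        #   rho_k = 1 / (y_k^T s_k)
        # and gamma = (s_k^T y_k) / (y_k^T y_k) scales the initial inverse
        # Hessian approximation H_k^0 = gamma * I.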
        if len(mem) > 0:
            mem[-1][2] += gradient  # completes y_k
            mem[-1][1] = x_k - self.prev_model  # s_k; subtracting two models yields a linear model perturbation
            mem[-1][0] = 1./mem[-1][2].inner_product(mem[-1][1])  # rho_k
            gamma = mem[-1][1].inner_product(mem[-1][2]) / mem[-1][2].inner_product(mem[-1][2])
else:
gamma = 1.0
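        # Two-loop recursion (Nocedal and Wright, Algorithm 7.4): the first
        # loop runs newest-to-oldest, accumulating alpha_i = rho_i * s_i^T q
        # and updating q; after scaling by gamma, the second loop runs
        # oldest-to-newest, adding the correction (alpha_i - beta_i) * s_i.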
alphas = []
for rho, s, y in reversed(mem):
alpha = rho * s.inner_product(q)
            t = alpha * y
q -= t
alphas.append(alpha)
alphas.reverse()
r = gamma * q
for alpha, m in zip(alphas, mem):
rho, s, y = m
            beta = rho * y.inner_product(r)
            r += (alpha - beta) * s
        # r is the (approximate) inverse Hessian applied to the gradient;
        # search along its negative
        direction = -1.0 * r
        # Reset the line search's initial step estimate only on the first
        # iteration after a memory reset.
        alpha0_kwargs = {'reset': False}
        if self._reset_line_search:
            alpha0_kwargs = {'reset': True}
            self._reset_line_search = False
alpha = self.select_alpha(shots, gradient, direction, objective_arguments,
current_objective_value=current_objective_value,
alpha0_kwargs=alpha0_kwargs, **kwargs)
self._print(' alpha {0}'.format(alpha))
self.store_history('alpha', iteration, alpha)
step = alpha * direction
        # Store what is needed to complete (rho_k, s_k, y_k) on the next call:
        # the pre-update model and -g_k as the partial y_k. These deep copies
        # might be removable.
        self.prev_model = x_k
        self.memory.append([None, None, copy.deepcopy(-1 * gradient)])
return step
def _compute_alpha0(self, phi0, grad0, reset=False, *args, **kwargs):
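        # On a reset, use the heuristic initial step
        #   alpha_0 = phi0 / (||grad0|| * V)**2,  V = prod(mesh.deltas),
        # which scales the first trial step by the current misfit and the mesh
        # cell volume; otherwise start from the natural L-BFGS step of 1.0.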
if reset:
return phi0 / (grad0.norm()*np.prod(self.solver.mesh.deltas))**2
else:
return 1.0