o
    QeY                    @   sN  d dl Z d dlZd dlZd dlmZ d dlm  mZ ddlm	Z	 g dZ
G dd deZG dd	 d	eZG d
d deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG d d! d!eZG d"d# d#eZG d$d% d%eZdS )&    N)Tensor   )_in_legacy_dygraph)LRScheduler	NoamDecayPiecewiseDecayNaturalExpDecayInverseTimeDecayPolynomialDecayLinearWarmupExponentialDecayMultiStepDecay	StepDecayLambdaDecayReduceOnPlateauCosineAnnealingDecayMultiplicativeDecay
OneCycleLRCyclicLRc                   @   sP   e Zd ZdZdddZdd Zdd
dZdd Zdd Zdd Z	e	Z
dd Zd	S )r   aV  

    LRScheduler Base class. Define the common interface of a learning rate scheduler.

    User can import it by ``from paddle.optimizer.lr import LRScheduler`` ,

    then overload it for your subclass and have a custom implementation of ``get_lr()`` .

    Otherwise, an ``NotImplementedError`` exception will be thrown.

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        instance to schedule learning rate.

    Examples:
        Here is an example of a simple ``StepDecay`` implementation.

        .. code-block:: python

            import paddle
            from paddle.optimizer.lr import LRScheduler

            class StepDecay(LRScheduler):
                def __init__(self,
                            learning_rate,
                            step_size,
                            gamma=0.1,
                            last_epoch=-1,
                            verbose=False):
                    if not isinstance(step_size, int):
                        raise TypeError(
                            "The type of 'step_size' must be 'int', but received %s." %
                            type(step_size))
                    if gamma >= 1.0:
                        raise ValueError('gamma should be < 1.0.')

                    self.step_size = step_size
                    self.gamma = gamma
                    super(StepDecay, self).__init__(learning_rate, last_epoch, verbose)

                def get_lr(self):
                    i = self.last_epoch // self.step_size
                    return self.base_lr * (self.gamma**i)

    皙?Fc                 C   sR   t |ttfstdt|t|| _t|| _|| _|| _	d | _
|   d S )Nz8The type of learning rate must be float, but received {})
isinstancefloatint	TypeErrorformattypebase_lrlast_lr
last_epochverbose	_var_namestep)selflearning_rater   r     r%   CD:\Projects\ConvertPro\env\Lib\site-packages\paddle/optimizer/lr.py__init__]   s   

zLRScheduler.__init__c                 C   s   | j S )zI
        Return lastest computed learning rate on current epoch.
        )r   r#   r%   r%   r&   __call__l   s   zLRScheduler.__call__Nc                 C   sp   |du r|  j d7  _ |  | _n|| _ t| dr|  | _n|  | _| jr6td| j | jj	| j dS dS )a}  

        ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .
        The new learning rate will take effect on next ``optimizer.step`` .

        Args:
            epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.

        Returns:
            None

        N   _get_closed_form_lr%Epoch {}: {} set learning rate to {}.)
r   get_lrr   hasattrr+   r    printr   	__class____name__)r#   epochr%   r%   r&   r"   r   s   

zLRScheduler.stepc                 C   sl   |    i }| jD ]*}|| jvrq	| j| }t|tr/|jdgks)J d|j| d }|||< q	|S )zx

        Returns the state of the scheduler as a :class:`dict`.

        It is a subset of ``self.__dict__`` .
        r*   z,shape of Tensor in state_dict must be [1] {}r   )
state_keyskeys__dict__r   r   shaper   numpy)r#   
state_dictkeyvaluer%   r%   r&   r8      s"   




zLRScheduler.state_dictc                 C   s   ddg| _ dS )a  

        For those subclass who overload ``LRScheduler`` (Base Class). Acquiescently, "last_epoch, last_lr" will be saved by ``self.keys = ['last_epoch', 'last_lr']`` .

        ``last_epoch`` is the current epoch num, and ``last_lr`` is the current learning rate.

        If you want to change the default behavior, you should have a custom implementation of ``_state_keys()`` to redefine ``self.keys`` .

        r   r   Nr4   r(   r%   r%   r&   r3      s   
zLRScheduler.state_keysc                 C   s\   |    | jD ]}||v r|| | j|< qtd|t|t| jkr,td dS dS )z.

        Loads the schedulers state.
        zYPlease check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dictzThere are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dictN)r3   r4   r5   RuntimeErrorr   lenwarningswarn)r#   r8   r9   r%   r%   r&   set_state_dict   s   
zLRScheduler.set_state_dictc                 C   s   t )z

        For those subclass who overload ``LRScheduler`` (Base Class), User should have a custom implementation of ``get_lr()`` .

        Otherwise, an ``NotImplementedError`` exception will be thrown.
        )NotImplementedErrorr(   r%   r%   r&   r-      s   zLRScheduler.get_lrr   r   FN)r1   
__module____qualname____doc__r'   r)   r"   r8   r3   r@   Zset_dictr-   r%   r%   r%   r&   r   *   s    
2
r   c                       s0   e Zd ZdZ			d	 fdd	Zdd Z  ZS )
r   ab  

    Applies Noam Decay to the initial learning rate.

    The algorithm can be described as following.

    .. math::

        new\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(epoch^{-0.5}, epoch * warmup\_steps^{-1.5})

    Please reference `attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_


    Args:
        d$_{model}$(int): The dimensionality of input and output feature vector of model. It is a python int number.
        warmup_steps(int): The number of warmup steps. A super parameter. It is a python int number
        learning_rate (float): The initial learning rate. It is a python float number. Default: 1.0.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``NoamDecay`` instance to schedule learning rate.

    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

          ?r   Fc                    s$   || _ || _tt| ||| d S rC   )d_modelwarmup_stepssuperr   r'   )r#   rH   rI   r$   r   r    r0   r%   r&   r'   "  s   zNoamDecay.__init__c                 C   sD   | j dkrd}n| j d }| jd | j  }| j| jd  t|| S )Nr   r*   g      g      )r   rI   r   rH   min)r#   abr%   r%   r&   r-   .  s
   

zNoamDecay.get_lr)rG   r   Fr1   rD   rE   rF   r'   r-   __classcell__r%   r%   rK   r&   r      s    Mr   c                       *   e Zd ZdZd fdd	Zdd Z  ZS )	r   a  

    Piecewise learning rate scheduler.

    The algorithm can be described as the code below:

    .. code-block:: text

        boundaries = [100, 200]
        values = [1.0, 0.5, 0.1]
        if epoch < 100:
            learning_rate = 1.0
        elif 100 <= global_step < 200:
            learning_rate = 0.5
        else:
            learning_rate = 0.1

    Args:
        boundaries(list|tuple): A list/tuple of steps numbers. The type of element in the list is python int.
        values(list|tuple): A list/tuple of learning rate values that will be picked during different epoch boundaries.
            The type of element in the list is python float.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``PiecewiseDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    r   Fc                    s$   || _ || _tt| j||d d S )N)r   r    )
boundariesvaluesrJ   r   r'   )r#   rR   rS   r   r    rK   r%   r&   r'     s
   

zPiecewiseDecay.__init__c                 C   sF   t t| jD ]}| j| j| k r| j|   S q| jt| jd  S Nr*   )ranger=   rR   r   rS   r#   ir%   r%   r&   r-     s
   zPiecewiseDecay.get_lrr   FrO   r%   r%   rK   r&   r   7  s    Mr   c                       rQ   )	r   a  

    Applies natural exponential decay to the initial learning rate.

    The algorithm can be described as following:

    .. math::

        new\_learning\_rate = learning\_rate * e^{- gamma * epoch}

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        gamma (float, optional): A Ratio to update the learning rate, should greater than 0.0 to make learning rate decay. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``NaturalExpDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    r   Fc                    s.   |dksJ d|| _ tt| ||| d S )N        zH 'gamma' must be a positive number so that the learning rate will decay.)gammarJ   r   r'   r#   r$   rZ   r   r    rK   r%   r&   r'     s   

zNaturalExpDecay.__init__c                 C   s   | j td| j | j  S )Nr   )r   mathexprZ   r   r(   r%   r%   r&   r-     s   zNaturalExpDecay.get_lrrX   rO   r%   r%   rK   r&   r     s    E	r   c                       rQ   )	r	   a  

    Applies inverse time decay to the initial learning rate.

    The algorithm can be described as following:

    .. math::

        new\_learning\_rate = \frac{learning\_rate}{1 + gamma * epoch}

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be less than 1.0. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``InverseTimeDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    r   Fc                    s   || _ tt| ||| d S rC   )rZ   rJ   r	   r'   r[   rK   r%   r&   r'   .  s   
zInverseTimeDecay.__init__c                 C   s   | j d| j| j   S rT   r   rZ   r   r(   r%   r%   r&   r-   4  s   zInverseTimeDecay.get_lrrX   rO   r%   r%   rK   r&   r	     s    Gr	   c                       s4   e Zd ZdZ					d
 fdd	Zdd	 Z  ZS )r
   a  

    Applies polynomial decay to the initial learning rate.

    The algorithm can be described as following.

    If cycle is set to True, then:

    .. math::

        decay\_steps & = decay\_steps * math.ceil(\frac{epoch}{decay\_steps})

        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr

    If cycle is set to False, then:

    .. math::

        epoch & = min(epoch, decay\_steps)

        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr


    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        decay_steps(int): The decay step size. It determines the decay cycle. It must be a positive integer.
        end_lr(float, optional): The minimum final learning rate. Default: 0.0001.
        power(float, optional): Power of polynomial, should greater than 0.0 to get learning rate decay. Default: 1.0.
        cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decrease
            to ``end_lr`` .  If False, the learning rate is monotone decreasing. Default: False.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``PolynomialDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    -C6?rG   Fr   c                    sZ   |dkr	t |tsJ d|| _|| _|dksJ d|| _|| _tt| ||| d S )Nr   z* 'decay_steps' must be a positive integer.rY   zG 'power' must be greater than 0.0 so that the learning rate will decay.)	r   r   decay_stepsend_lrpowercyclerJ   r
   r'   )r#   r$   r`   ra   rb   rc   r   r    rK   r%   r&   r'     s   



zPolynomialDecay.__init__c                 C   s   | j }| j}| jr#tt| j t| j }| j dkrd}| j| }nt| j | j}| j| j dt|t|  | j	  | j S )Nr   r*   )
r   r`   rc   r\   ceilr   rL   r   ra   rb   )r#   Ztmp_epoch_numZtmp_decay_stepsZdiv_resr%   r%   r&   r-     s   

zPolynomialDecay.get_lr)r_   rG   Fr   FrO   r%   r%   rK   r&   r
   8  s    Zr
   c                       sF   e Zd ZdZ		d fdd	Z fddZ fdd	Zd
d Z  ZS )r   ae  

    Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler.
    For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks <https://arxiv.org/abs/1812.01187>`_

    When epoch < warmup_steps, learning rate is updated as:

    .. math::

            lr = start\_lr + (end\_lr - start\_lr) * \frac{epoch}{warmup\_steps}

    where start_lr is the initial learning rate, and end_lr is the final learning rate;

    When epoch >= warmup_steps, learning rate is updated as:

    .. math::

            lr = learning_rate

    where ``learning_rate`` is float or any subclass of ``LRScheduler`` .

    Args:
        learning_rate (float|LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``LRScheduler`` .
        warmup_steps (int): total steps of warm up. It must be a positive integer.
        start_lr (float): Initial learning rate of warm up.
        end_lr (float): Final learning rate of warm up.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``LinearWarmup`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.LinearWarmup(
                    learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.LinearWarmup(
                    learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    r   Fc                    s   t |tpt |tpt |t}|std||| _|dkr$t |ts(J d|| _|| _|| _	||ks=J d||t
t| ||| d S )NzWthe type of learning_rate should be [int, float or LRScheduler], the current type is {}r   z+ 'warmup_steps' must be a positive integer.z*end_lr {} must be greater than start_lr {})r   r   r   r   r   r   r$   rI   start_lrra   rJ   r   r'   )r#   r$   rI   re   ra   r   r    Z
type_checkrK   r%   r&   r'     s0   




zLinearWarmup.__init__c                    s,   t t|  }t| jtr| j |d< |S )z
        Returns the state of the LinearWarmup scheduler as a :class:`dict`.

        It is a subset of ``self.__dict__`` .
        LinearWarmup_LR)rJ   r   r8   r   r$   r   r#   r8   rK   r%   r&   r8   /  s   zLinearWarmup.state_dictc                    s4   t t| | t| jtr| j|d  dS dS )z>
        Loads state_dict for LinearWarmup scheduler.
        rf   N)rJ   r   r@   r   r$   r   rg   rK   r%   r&   r@   :  s   zLinearWarmup.set_state_dictc                 C   s`   | j | jk r| j| j t| j  t| j | j S t| jtr-| j| j | j  |  S | jS rC   )	r   rI   ra   re   r   r   r$   r   r"   r(   r%   r%   r&   r-   B  s   zLinearWarmup.get_lrrX   )	r1   rD   rE   rF   r'   r8   r@   r-   rP   r%   r%   rK   r&   r     s    Z r   c                       rQ   )	r   a  

    Update learning rate by `gamma` each epoch.

    The algorithm can be described as following.

    .. math::

        new\_learning\_rate = last\_learning\_rate * gamma

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        gamma (float): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be in interval (0.0, 1.0).
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``ExponentialDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    r   Fc                    s6   |dkr|dk sJ d|| _ tt| ||| d S )NrY   rG   zM 'gamma' must be in interval (0.0, 1.0) so that the learning rate will decay.)rZ   rJ   r   r'   r[   rK   r%   r&   r'     s   
zExponentialDecay.__init__c                 C   s   | j | j| j  S rC   r^   r(   r%   r%   r&   r-        zExponentialDecay.get_lrrX   rO   r%   r%   rK   r&   r   O  s    F	r   c                       ,   e Zd ZdZ	d	 fdd	Zdd Z  ZS )
r   a  
    Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones.

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5
        milestones = [30, 50]
        gamma = 0.1
        if epoch < 30:
            learning_rate = 0.5
        elif epoch < 50:
            learning_rate = 0.05
        else:
            learning_rate = 0.005

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        milestones (tuple|list): List or tuple of each boundaries. Must be increasing.
        gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be less than 1.0. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .


    Returns:
        ``MultiStepDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    r   r   Fc                    s|   t  ttfstdt  t fddtt d D s$td|dkr,td | _	|| _
tt| ||| d S )NzTThe type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s.c                    s    g | ]} |  |d   k qS )r*   r%   ).0rW   
milestonesr%   r&   
<listcomp>  s    z+MultiStepDecay.__init__.<locals>.<listcomp>r*   z.The elements of milestones must be incrementedrG   gamma should be < 1.0.)r   tuplelistr   r   allrU   r=   
ValueErrorrl   rZ   rJ   r   r'   )r#   r$   rl   rZ   r   r    rK   rk   r&   r'     s"   
zMultiStepDecay.__init__c                 C   sN   t t| jD ]}| j| j| k r| j| j|    S q| j| jt| j  S rC   )rU   r=   rl   r   r   rZ   rV   r%   r%   r&   r-   
  s
   zMultiStepDecay.get_lrrB   rO   r%   r%   rK   r&   r     s
    Pr   c                       ri   )
r   a  
    Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` number of epoch.

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5
        step_size = 30
        gamma = 0.1

        learning_rate = 0.5     if epoch < 30
        learning_rate = 0.05    if 30 <= epoch < 60
        learning_rate = 0.005   if 60 <= epoch < 90
        ...

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        step_size (int): the interval to update. It must be a positive integer.
        gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be less than 1.0. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``StepDecay`` instance to schedule learning rate.


    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    r   r   Fc                    sh   t |tstdt| |dkrtd|dkrt |ts"J d|| _|| _tt| 	||| d S )Nz7The type of 'step_size' must be 'int', but received %s.rG   rn   r   z( 'step_size' must be a positive integer.)
r   r   r   r   rr   	step_sizerZ   rJ   r   r'   )r#   r$   rs   rZ   r   r    rK   r%   r&   r'   `  s    

zStepDecay.__init__c                 C   s   | j | j }| j| j|  S rC   )r   rs   r   rZ   rV   r%   r%   r&   r-   r  s   zStepDecay.get_lrrB   rO   r%   r%   rK   r&   r     s
    Or   c                       rQ   )	r   a  
    Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is funciton which receives ``epoch`` .

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5        # init learning_rate
        lr_lambda = lambda epoch: 0.95 ** epoch

        learning_rate = 0.5        # epoch 0, 0.5*0.95**0
        learning_rate = 0.475      # epoch 1, 0.5*0.95**1
        learning_rate = 0.45125    # epoch 2, 0.5*0.95**2

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the initial learning rate by this factor.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``LambdaDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    r   Fc                    6   t |stdt| || _tt| ||| d S )NzMThe type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s.)callabler   r   	lr_lambdarJ   r   r'   r#   r$   rv   r   r    rK   r%   r&   r'     s   zLambdaDecay.__init__c                 C   s   | j | | j S rC   )r   rv   r   r(   r%   r%   r&   r-     rh   zLambdaDecay.get_lrrX   rO   r%   r%   rK   r&   r   w  s    J
r   c                   @   sF   e Zd ZdZ										dd
dZdd ZdddZdd ZdS )r   a  
    Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate
    by 2 to 10 times once model performance has no longer improvement.

    The ``metrics`` is the one which has been pass into ``step`` , it must be 1-D Tensor with shape [1]. When ``metrics``
    stop descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * factor`` .
    (Specially, ``mode`` can also be set to ``'max`` , in this case, when ``metrics`` stop ascending for a ``patience``
    number of epochs, the learning rate will be reduced.)

    In addition, After each reduction, it will wait a ``cooldown`` number of epochs before resuming above operation.

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the
            learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` ,  the learning
            rate will reduce when ``loss`` stops ascending. Default: ``'min'`` .
        factor (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * factor`` .
            It should be less than 1.0. Default: 0.1.
        patience (int, optional): When ``loss`` doesn't improve for this number of epochs, learing rate will be reduced.
            Default: 10.
        threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` .
            This make tiny changes of ``loss`` will be ignored. Default: 1e-4.
        threshold_mode (str, optional): ``'rel'`` or ``'abs'`` can be selected. In ``'rel'`` mode, the minimum change of ``loss``
            is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in last epoch. In ``'abs'`` mode, the minimum
            change of ``loss`` is ``threshold`` . Default: ``'rel'`` .
        cooldown (int, optional): The number of epochs to wait before resuming normal operation. Default: 0.
        min_lr (float, optional): The lower bound of the learning rate after reduction. Default: 0.
        epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon,
            the update is ignored. Default: 1e-8.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``.


    Returns:
        ``ReduceOnPlateau`` instance to schedule learning rate.


    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step(loss)    # If you update learning rate each step
              # scheduler.step(loss)        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step(out[0])    # If you update learning rate each step
              # scheduler.step(out[0])        # If you update learning rate each epoch

    rL   r   
   r_   relr   :0yE>Fc                 C   s   |  }|dvrtd| d || _|dkrtd|| _|  }|dvr.td| d || _t|ttfs@tdt	| || _
|| _|| _|| _|| _|	| _d	| _d | _d	| _t|| _t|| _d	| _|
| _d | _d S )
N)rL   maxzmode: z is unknown!rG   z5new_lr = origin_lr * gamma and gamma should be < 1.0.)ry   abszthreshold mode: zRThe type of 'learning_rate' in 'ReduceOnPlateau' must be 'float', but received %s.r   )lowerrr   modefactorthreshold_moder   r   r   r   r   patience	thresholdcooldownmin_lrepsiloncooldown_counterbestnum_bad_epochsr   r   r   r    r!   )r#   r$   r~   r   r   r   r   r   r   r   r    r%   r%   r&   r'   '  sF   



zReduceOnPlateau.__init__c                 C   s   g d| _ d S )N)r   r   r   r   r   r;   r(   r%   r%   r&   r3   ^  s   zReduceOnPlateau.state_keysNc                 C   sf  |du r| j d | _ n|| _ t stjj}nddlm} |}t||tj	fr=t
|jdkr4|jd dks<J d|jnt|tttjtjfsQtdt|| jdkr_|  jd8  _dS | jdu sk| || jrr|| _d| _n|  jd7  _| j| jkr| j| _d| _t| j| j | j}| j| | jkr|| _| jrtd| j | j j!| j dS dS dS dS )a  
        step should be called after `optimizer.step()` . It will update the learning rate in optimizer according to ``metrics`` .
        The new learning rate will take effect on next epoch.

        Args:
            metrics (Tensor|numpy.ndarray|float): Which will be monitored to determine whether the learning rate will reduce.
                If it stop descending for a ``patience`` number of epochs, the learning rate will reduce. If it's 'Tensor' or
                'numpy.ndarray', its shape must be [1].
            epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.

        Returns:
            None

        Examples:
            Please refer to the example of current LRScheduler.
        Nr*   r   )VarBasezthe metrics.shape should be (1L,), but the current metrics.shape is {}. Maybe that you should call paddle.mean to process it first.z^metrics must be 'int', 'float', 'np.float', 'numpy.ndarray' or 'paddle.Tensor', but receive {}r,   )"r   r   coreeagerr   Zpaddle.frameworkr   r   r7   Zndarrayr=   r6   r   r   r   Zfloat32Zfloat64r   r   r   r   
_is_betterr   r   r   r{   r   r   r   r   r    r/   r0   r1   )r#   Zmetricsr2   tmpr   Znew_lrr%   r%   r&   r"   g  sZ   

zReduceOnPlateau.stepc                 C   s|   | j dkr| jdkr|||| j  k S | j dkr$| jdkr$||| j k S | j dkr7| jdkr7|||| j  kS ||| j kS )NrL   ry   r|   r{   )r~   r   r   )r#   currentr   r%   r%   r&   r     s   zReduceOnPlateau._is_better)	rL   r   rx   r_   ry   r   r   rz   FrC   )r1   rD   rE   rF   r'   r3   r"   r   r%   r%   r%   r&   r     s    Y
7
	Gr   c                       s4   e Zd ZdZ	d fdd	Zdd Zd	d
 Z  ZS )r   a  

    Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to
    the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in
    SGDR.

    The algorithm can be described as following.

    .. math::

        \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
        + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
        & T_{cur} \neq (2k+1)T_{max};

        \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
        \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
        & T_{cur} = (2k+1)T_{max}.

    It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts <https://arxiv.org/abs/1608.03983>`_.
    Note that this only implements the cosine annealing part of SGDR, and not the restarts.

    Args:
        learning_rate (float): The initial learning rate, that is :math:`\eta_{max}` . It can be set to python float or int number.
        T_max (int): Maximum number of iterations. It is half of the decay cycle of learning rate. It must be a positive integer.
        eta_min (float|int, optional): Minimum learning rate, that is :math:`\eta_{min}` . Default: 0.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``CosineAnnealingDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    r   r   Fc                    sz   t |tstdt| t |ttfstdt| |dkr%t |ts)J d|| _t|| _tt| 	||| d S )NzMThe type of 'T_max' in 'CosineAnnealingDecay' must be 'int', but received %s.zVThe type of 'eta_min' in 'CosineAnnealingDecay' must be 'float, int', but received %s.r   z$ 'T_max' must be a positive integer.)
r   r   r   r   r   T_maxeta_minrJ   r   r'   )r#   r$   r   r   r   r    rK   r%   r&   r'     s,   



zCosineAnnealingDecay.__init__c                 C   s   | j dkr| jS | j d | j d| j  dkr-| j| j| j dttj| j   d  S dttj| j  | j  dttj| j d  | j   | j| j  | j S )Nr   r*   r   )r   r   r   r   r   r\   cospir(   r%   r%   r&   r-   $  s$   


zCosineAnnealingDecay.get_lrc                 C   s2   | j | j| j  dttj| j | j   d  S )Nr*   r   )r   r   r\   r   r   r   r   r(   r%   r%   r&   r+   3  s   
z(CosineAnnealingDecay._get_closed_form_lr)r   r   F)r1   rD   rE   rF   r'   r-   r+   rP   r%   r%   rK   r&   r     s    Rr   c                       rQ   )	r   aM  
    Multiply the learning rate of ``optimizer`` by the factor given in function ``lr_lambda`` .

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5        # init learning_rate
        lr_lambda = lambda epoch: 0.95

        learning_rate = 0.5        # epoch 0,
        learning_rate = 0.475      # epoch 1, 0.5*0.95
        learning_rate = 0.45125    # epoch 2, 0.475*0.95

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the last learning rate by this factor.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``MultiplicativeDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.MultiplicativeDecay(learning_rate=0.5, lr_lambda=lambda x:0.95, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    r   Fc                    rt   )NzUThe type of 'lr_lambda' in 'MultiplicativeDecay' must be 'function', but received %s.)ru   r   r   rv   rJ   r   r'   rw   rK   r%   r&   r'   k  s   
zMultiplicativeDecay.__init__c                 C   s.   | j }td| jd D ]	}|| | }q|S rT   )r   rU   r   rv   )r#   Zcur_lrr2   r%   r%   r&   r-   w  s   zMultiplicativeDecay.get_lrrX   rO   r%   r%   rK   r&   r   <  s    .r   c                       sH   e Zd ZdZ							d fdd		Zd
d Zdd Zdd Z  ZS )r   uB  
    Sets the learning rate according to the one cycle learning rate scheduler.
    The scheduler adjusts the learning rate from an initial learning rate to the maximum learning rate and then
    from that maximum learning rate to the minimum learning rate, which is much less than the initial learning rate.

    It has been proposed in `Super-Convergence: Very Fast Training of Neural Networks Using Large Learning Rates <https://arxiv.org/abs/1708.07120>`_.

    Please note that the default behaviour of this scheduler follows the fastai implementation of one cycle,
    which claims that “unpublished work has shown even better results by using only two phases”.
    If you want the behaviour of this scheduler to be consistent with the paper, please set ``three_phase=True`` .

    Also note that you should update learning rate each step.

    Args:
        max_learning_rate (float): The maximum learning rate. It is a python float number.
             Functionally, it defines the initial learning rate by ``divide_factor`` .
        total_steps (int): Number of total training steps.
        divide_factor (float): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25.
        end_learning_rate (float, optional): The minimum learning rate during training, it should be much less than initial learning rate.
        phase_pct (float): The percentage of total steps which used to increasing learning rate. Default: 0.3.
        anneal_strategy (str, optional): Strategy of adjusting learning rate.'cos' for cosine annealing,
            'linear' for linear annealing. Default: 'cos'.
        three_phase (bool, optional): Whether to use three phase.
            If ``True``:
                1. The learning rate will first increase from initial learning rate to maximum learning rate.
                2. Then it will decrease to initial learning rate. Number of step in this phase is the same as the one in first phase.
                3. Finally, it will decrease to minimum learning rate which is much less than initial learning rate.
            If ``False``:
                1. The learning rate will increase to maximum learning rate.
                2. Then it will directly decrease to minimum learning rate.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``OneCycleLR`` instance to schedule learning rate.

    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.OneCycleLR(max_learning_rate=1.0, total_steps=100, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(5):
                for batch_id in range(20):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()        # You should update learning rate each step

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.OneCycleLR(max_learning_rate=1.0, total_steps=100, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(5):
                for batch_id in range(20):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # You should update learning rate each step
          9@r_   333333?r   Fr   c
                    s`  t |ttfstdt||dk rtdt |ttfs(tdt||dk r0tdt |ts>tdt||dkrFtd|| _t |tsWtdt||dk s_|d	krftd
|t |ttfsvtdt||t| }
t|}|r|dkrtdd|| j d	 d| | j d | jd	 | jd	 g| _| jd	 | jd  | jd | jd	  | jd | jd  | jd | jd  g| _	|
||
|g| _
n6d|| j d	 | jd	 | jd	 g| _| jd	 | jd  | jd | jd	  | jd | jd	  g| _	|
||g| _
|dkr| j| _n|dkr| j| _ntd|tt| |
||	 d S )N='max_learning_rate' must be 'float' or 'int', but received {}r   z/'max_learning_rate' must be a positive integer.z='end_learning_rate' must be 'float' or 'int', but received {}z/'end_learning_rate' must be a positive integer.z+'total_step' must be 'int', but received {}z('total_step' must be a positive integer.z,'phase_pct' must be 'float', but received {}r*   z4'phase_pct' must be between 0 and 1, but received {}z9'divide_factor' must be 'float' or 'int', but received {}g      ?z;When three_phase is True, 'phase_pct' must be less than 0.5r      r   ZlinearzC'anneal_strategy' must by one of 'cos' or 'linear', but received {})r   r   r   r   r   r   rr   total_steps_step_config_steps_size
_lr_config_cos_annealinganneal_func_linear_annealingrJ   r   r'   )r#   max_learning_rater   Zdivide_factorZend_learning_rateZ	phase_pctZanneal_strategyZthree_phaser   r    Z
initial_lrr   rK   r%   r&   r'     s   

		



zOneCycleLR.__init__c                 C   s(   t t j| d }||| d |  S Nr*   g       @)r\   r   r   )r#   re   ra   pctZcos_outr%   r%   r&   r   M  s   zOneCycleLR._cos_annealingc                 C   s   || | | S rC   r%   )r#   re   ra   r   r%   r%   r&   r   Q     zOneCycleLR._linear_annealingc                 C   s   | j }|| jkrtd|| jtt| jdd  | jD ]-\}\}}||ks1|t| j	d krK|| j|  | }| 
| j	| | j	|d  |  S qd S )Nz?Tried to step {} times. However the number of total steps is {}r*   r   )r   r   rr   r   	enumeratezipr   r   r=   r   r   )r#   Zcurrent_steprW   Zend_steprs   
percentager%   r%   r&   r-   T  s"   
zOneCycleLR.get_lr)r   r_   r   r   Fr   F)	r1   rD   rE   rF   r'   r   r   r-   rP   r%   r%   rK   r&   r   ~  s    W{r   c                       sP   e Zd ZdZ							d fdd		Zd
d Zdd Zdd Zdd Z  Z	S )r   a  
    Set the learning rate according to the cyclic learning rate (CLR) scheduler.
    The scheduler regards the process of learning rate adjustment as one cycle after another.
    It cycles the learning rate between two boundaries with a constant frequency.
    The distance between the two boundaries can be scaled on a per-iteration or per-cycle basis.

    It has been proposed in `Cyclic Learning Rates for Training Neural Networks <https://arxiv.org/abs/1506.01186>`_.

    According to the paper, the cyclic learning rate schedule has three build-in scale methods:

    * "triangular": A basic triangular cycle without any amplitude scaling.
    * "triangular2": A basic triangular cycle that reduce initial amplitude by half each cycle.
    * "exp_range": A cycle that scales initial amplitude by scale function which is defined as :math:`gamma^{iterations}` .

    The initial amplitude is defined as max_learning_rate - base_learning_rate.
    Also note that you should update learning rate each step.

    Args:
        base_learning_rate (float): Initial learning rate, which is the lower boundary in the cycle. The paper recommends
            that set the base_learning_rate to 1/3 or 1/4 of max_learning_rate.
        max_learning_rate (float): Maximum learning rate in the cycle. It defines the cycle amplitude as above.
            Since there is some scaling operation during process of learning rate adjustment,
            max_learning_rate may not actually be reached.
        step_size_up (int): Number of training steps, which is used to increase learning rate in a cycle.
            The step size of one cycle will be defined by step_size_up + step_size_down. According to the paper, step
            size should be set as at least 3 or 4 times steps in one epoch.
        step_size_down (int, optional): Number of training steps, which is used to decrease learning rate in a cycle.
            If not specified, it's value will initialize to `` step_size_up `` . Default: None
        mode (str, optional): one of 'triangular', 'triangular2' or 'exp_range'.
            If scale_fn is specified, this argument will be ignored. Default: 'triangular'
        exp_gamma (float): Constant in 'exp_range' scaling function: exp_gamma**iterations. Used only when mode = 'exp_range'. Default: 1.0
        scale_fn (function, optional): A custom scaling function, which is used to replace three build-in methods.
            It should only have one argument. For all x >= 0, 0 <= scale_fn(x) <= 1.
            If specified, then 'mode' will be ignored. Default: None
        scale_mode (str, optional): One of 'cycle' or 'iterations'. Defines whether scale_fn is evaluated on cycle
            number or cycle iterations (total iterations since start of training). Default: 'cycle'
        last_epoch (int, optional): The index of last epoch. Can be set to restart training.Default: -1, means initial learning rate.
        verbose: (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``CyclicLR`` instance to schedule learning rate.

    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5, max_learning_rate=1.0, step_size_up=15, step_size_down=5, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(5):
                for batch_id in range(20):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()        # You should update learning rate each step

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5,
                    max_learning_rate=1.0, step_size_up=15, step_size_down=5, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(5):
                for batch_id in range(20):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # You should update learning rate each step
    N
triangularrG   rc   r   Fc                    s  t |ttfstdt||dk rtd|t |ts)tdt||dkr4td||d urQt |tsFtdt||dkrQtd|t |ts_tdt|t|}|d urkt|n|}|| | _|| j | _t|| _	| j	| | _
|d	vr|d u rtd
|dvrtd|| _|| _|d u r| jdkr| j| _d| _n | jdkr| j| _d| _n| jdkr| j| _d| _n|| _|| _t ||	|
 d S )Nr   r   z?'max_learning_rate' must be a positive integer, but received {}z7The type of 'step_size_up' must be int, but received {}z:'step_size_up' must be a positive integer, but received {}z9The type of 'step_size_down' must be int, but received {}z<'step_size_down' must be a positive integer, but received {}z6The type of 'exp_gamma' must be float, but received {})r   triangular2	exp_rangeza'mode' is invalid and 'scale_fn' is not specified, make sure one of 'mode' or 'scale_fn' is valid)rc   
iterationsz2'scale_mode' must be one of 'cycle' or 'iterationsr   rc   r   r   r   )r   r   r   r   r   r   rr   
cycle_sizestep_up_pctZmax_lr	amplituder~   rZ   _triangular_scale_fnscale_fn
scale_mode_triangular2_scale_fn_exp_range_scale_fnrJ   r'   )r#   Zbase_learning_rater   Zstep_size_upZstep_size_downr~   Z	exp_gammar   r   r   r    rK   r%   r&   r'     s   







zCyclicLR.__init__c                 C   s   dS )NrG   r%   r#   xr%   r%   r&   r   /  s   zCyclicLR._triangular_scale_fnc                 C   s   dd|d   S r   r%   r   r%   r%   r&   r   2  r   zCyclicLR._triangular2_scale_fnc                 C   s
   | j | S rC   )rZ   r   r%   r%   r&   r   5  s   
zCyclicLR._exp_range_scale_fnc                 C   sv   | j }d|| j  }d|| j  | }|| jkr|| j }n	d| d| j  }| j| }| j|| t| j  }|S )Nr*   rG   )r   r   r   r   r   r   evalr   )r#   r   rc   Zpct_per_cycleZscale_factorZbase_heightlrr%   r%   r&   r-   8  s   

zCyclicLR.get_lr)Nr   rG   Nrc   r   F)
r1   rD   rE   rF   r'   r   r   r   r-   rP   r%   r%   rK   r&   r   j  s    _jr   )r\   r7   r>   Zpaddler   Zpaddle.fluid.coreZfluidr   Zfluid.frameworkr   __all__objectr   r   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r%   r%   r%   r&   <module>   s:    /_\SR  TnfY m B m