from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable, name_scope
from paddle import _C_ops, _legacy_C_ops
from ..fluid.dygraph import no_grad

__all__ = []


class Adamax(Optimizer):
    r"""
    The Adamax optimizer is implemented based on the Adamax optimization method
    described in Section 7 of the `Adam paper <https://arxiv.org/abs/1412.6980>`_.
    The Adamax algorithm is a variant of Adam based on the infinity norm,
    which makes the learning rate update rule simpler and more stable.

    The update rule for parameter ``param_out`` with gradient ``grad`` is:

    .. math::

        t & = t + 1

        moment\_out & = {\beta}_1 * moment + (1 - {\beta}_1) * grad

        inf\_norm\_out & = max({\beta}_2 * inf\_norm + \epsilon, |grad|)

        learning\_rate & = \frac{learning\_rate}{1 - {\beta}_1^t}

        param\_out & = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out}

    Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_

    The original paper does not include an ``epsilon`` attribute; it is added
    here for numerical stability, to prevent division by zero.
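
    As an illustration, the following is a minimal NumPy sketch of a single update
    step that follows the formulas above (the helper ``adamax_step`` is purely
    illustrative; it is not the actual kernel implementation):

    .. code-block:: python

        import numpy as np

        def adamax_step(param, grad, moment, inf_norm, t,
                        lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
            # moment_out = beta1 * moment + (1 - beta1) * grad
            moment = beta1 * moment + (1 - beta1) * grad
            # inf_norm_out = max(beta2 * inf_norm + epsilon, |grad|)
            inf_norm = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
            # bias-corrected step size: learning_rate / (1 - beta1^t)
            lr_t = lr / (1 - beta1 ** t)
            # param_out = param - learning_rate * moment_out / inf_norm_out
            param = param - lr_t * moment / inf_norm
            return param, moment, inf_norm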

    Args:
        learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
            It can be a float value or a LRScheduler. The default value is 0.001.
        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
            The default value is 0.9.
        beta2 (float, optional): The exponential decay rate used for the exponentially
            weighted infinity norm (the ``inf_norm`` accumulator). The default value is 0.999.
        epsilon (float, optional): A small float value for numerical stability.
            The default value is 1e-08.
        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``.
            This parameter is required in dygraph mode. You can also specify different options
            (such as the learning rate or weight decay) for different parameter groups; in that
            case, the parameters are a list of dicts (see the second example below). Note that
            the learning_rate in a parameter group is a scale factor applied to the base
            learning_rate. The default value is None in static mode, in which case all
            parameters will be updated.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization.
            It can be a float value as the coefficient of L2 regularization, or
            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
            If a parameter has already set a regularizer using :ref:`api_fluid_ParamAttr`,
            the regularization setting here in the optimizer will be ignored for that parameter.
            Otherwise, the regularization setting here in the optimizer will take effect.
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy; it's an instance of
            some derived class of ``GradientClipBase``. There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no
            gradient clipping (a short usage sketch follows this list).
        name (str, optional): Normally there is no need for the user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.
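
    As a brief sketch, gradient clipping can be enabled by passing a clip instance
    through ``grad_clip``. This assumes the public ``paddle.nn.ClipGradByGlobalNorm``
    API, which derives from ``GradientClipBase``:

    .. code-block:: python

        import paddle

        linear = paddle.nn.Linear(10, 10)
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
        opt = paddle.optimizer.Adamax(learning_rate=0.1,
                                      parameters=linear.parameters(),
                                      grad_clip=clip)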

    **Notes**:
        **Currently, Adamax doesn't support sparse parameter optimization.**

    Examples:
        .. code-block:: python
            
            import paddle

            inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
            linear = paddle.nn.Linear(10, 10)
            out = linear(inp)
            loss = paddle.mean(out)

            beta1 = paddle.to_tensor([0.9], dtype="float32")
            beta2 = paddle.to_tensor([0.99], dtype="float32")

            adamax = paddle.optimizer.Adamax(learning_rate=0.1,
                    parameters=linear.parameters(),
                    beta1=beta1,
                    beta2=beta2,
                    weight_decay=0.01)
            out.backward()
            adamax.step()
            adamax.clear_grad()


            # Note that the effective learning_rate of linear_2 is 0.1 * 0.1 = 0.01,
            # since the learning_rate in a parameter group scales the base learning_rate.
            linear_1 = paddle.nn.Linear(10, 10)
            linear_2 = paddle.nn.Linear(10, 10)
            inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
            out = linear_1(inp)
            out = linear_2(out)
            loss = paddle.mean(out)
            adamax = paddle.optimizer.Adamax(
                learning_rate=0.1,
                parameters=[{
                    'params': linear_1.parameters()
                }, {
                    'params': linear_2.parameters(),
                    'weight_decay': 0.001,
                    'learning_rate': 0.1,
                    'beta1': 0.8
                }],
                weight_decay=0.01,
                beta1=0.9)
            out.backward()
            adamax.step()
            adamax.clear_grad()
    momentinf_normbeta1_pow_accMbP??+?:0yE>Nc	           	         s   |d usJ |d usJ |d usJ |d usJ d|  kr%dk s*t d t dd|  kr7dk s<t d t dd|ksDt dtt| j|||||d d| _|| _|| _|| _|||d| _d S )	Nr   r   z.Invaild value of beta1, expect beta1 in [0,1).z.Invaild value of beta2, expect beta2 in [0,1).z.Invaild value of epsilon, expect epsilon >= 0.)learning_rate
parametersweight_decay	grad_clipnameadamaxbeta1beta2epsilon)	
ValueErrorsuperr   __init__type_beta1_beta2_epsilon_default_dict)	selfr   r   r   r   r   r   r   r   	__class__ GD:\Projects\ConvertPro\env\Lib\site-packages\paddle/optimizer/adamax.pyr       s:   
zAdamax.__init__c                 C   sV   t |tr
| |}|D ]}| | j| | | j| | j| j|| jdgd qd S )Nr   )r   paramZ
fill_valueshape)
isinstancedict_update_param_groupZ_add_accumulator_moment_acc_str_inf_norm_acc_str_beta1_pow_acc_strr"   )r&   blockr   pr)   r)   r*   _create_accumulators   s   

zAdamax._create_accumulatorsc                 C   s.  t |tjsJ t |tr| |}| | j|d }| | j|d }| | j|d }t	 rJt
|d |d | ||||| j| j| j	 d S t rot|d |d | |||||d ||d| jd| jd| j d S |j| j|d |d | ||||d|d ||d| j| j| jdd	d
}|S )Nr   r   r   r   r   )ParamZGradZLearningRateZMomentZInfNormZBeta1Pow)ZParamOutZ	MomentOutZ
InfNormOutr   Tr!   ZinputsZoutputsattrsstop_gradient)r-   r   Blockr.   r/   _get_accumulatorr0   r1   r2   in_dygraph_moder	   Zadamax_Z_create_param_lrr"   r#   r$   Z_in_legacy_dygraphr
   r   	append_opr!   )r&   r3   Zparam_and_gradr   r   r   Z	adamax_opr)   r)   r*   _append_optimize_op   sx   



	zAdamax._append_optimize_opc              
   C   sb  t |tjsJ t |tr|D ]\}}|du s|jdu rqt rK| | j|}t  t	
|| jdd}||d W d   n1 sEw   Y  q|jj||g4 td  | | j|}|jdd|id|id| jidd	 W d   n1 s|w   Y  W d   n1 sw   Y  qdS |d
 D ]\}}|du s|jdu rqt r| | j|}|d| jd | _t  t	
|| jdd}||d W d   n1 sw   Y  q|jj||g? td* | | j|}|d| jd | _|jdd|id|id| jidd	 W d   n	1 sw   Y  W d   n	1 s)w   Y  qdS )zUpdate Beta1 Power accumulatorNTg        Fr   scaleXZOutr7   paramsr   )r-   r   r:   listr9   r<   r;   r2   r   r	   r?   r"   Zcopy_r3   programZ_optimized_guardr   r=   getr%   )r&   r3   Zparameters_and_gradsr+   Zgradr   tmpr)   r)   r*   _finish_update  s   

 


 zAdamax._finish_updatec                 C   sJ   | d| jd | _| d| jd | _| d| jd | _| d}|S )Nr   r   r   rA   )rD   r%   r"   r#   r$   )r&   r   r)   r)   r*   r/   C  s
   
zAdamax._update_param_group)r   r   r   r   NNNN)__name__
__module____qualname____doc__r0   r1   r2   r    r5   r>   rF   r/   __classcell__r)   r)   r'   r*   r      s$    i&FAr   N)Z	optimizerr   Zfluidr   r   Zfluid.frameworkr   r   Zpaddler	   r
   Zfluid.dygraphr   __all__r   r)   r)   r)   r*   <module>   s   