from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable
from ..fluid import layers
from ..fluid import unique_name
from ..fluid.layer_helper import LayerHelper
from paddle import _C_ops, _legacy_C_ops
from paddle.fluid.executor import global_scope
import paddle

__all__ = []


class Lamb(Optimizer):
    r"""
    LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer.

    LAMB Optimizer is designed to scale up the batch size of training without losing
    accuracy, which supports adaptive element-wise updating and accurate layer-wise
    correction. For more information, please refer to `Large Batch Optimization for
    Deep Learning: Training BERT in 76 minutes <https://arxiv.org/abs/1904.00962>`_ .

    The updating of parameters follows:

    ..  math::

        m_t &= \beta_1 m_{t - 1}+ (1 - \beta_1)g_t

        v_t &= \beta_2 v_{t - 1}  + (1 - \beta_2)g_t^2

        m_t &= \frac{m_t}{1 - \beta_1^t}

        v_t &= \frac{v_t}{1 - \beta_2^t}

        r_t &= \frac{m_t}{\sqrt{v_t}+\epsilon}

        w_t &= w_{t-1} -\eta_t \frac{\left \| w_{t-1}\right \|}{\left \| r_t + \lambda w_{t-1}\right \|} (r_t + \lambda w_{t-1})


    where :math:`m` is the first moment, :math:`v` the second moment, :math:`\eta` the
    learning rate, and :math:`\lambda` the LAMB weight decay rate.
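
    As a rough illustration only (not part of the Paddle API), a single LAMB
    step for one parameter tensor can be sketched in NumPy; ``lamb_step_sketch``
    and its argument names are hypothetical, with ``w`` the parameter, ``g`` its
    gradient, ``m``/``v`` the moments, and ``t`` the 1-based step count:

    ..  code-block:: python

        import numpy as np

        def lamb_step_sketch(w, g, m, v, t, lr=0.001, beta1=0.9,
                             beta2=0.999, eps=1e-6, wd=0.01):
            # Biased first and second moment estimates.
            m = beta1 * m + (1 - beta1) * g
            v = beta2 * v + (1 - beta2) * g * g
            # Bias correction, as in the formulas above.
            m_hat = m / (1 - beta1 ** t)
            v_hat = v / (1 - beta2 ** t)
            # Adam-style direction plus decoupled weight decay.
            r = m_hat / (np.sqrt(v_hat) + eps) + wd * w
            # Layer-wise trust ratio rescales the step for this tensor.
            w_norm = np.linalg.norm(w)
            r_norm = np.linalg.norm(r)
            trust = w_norm / r_norm if w_norm > 0 and r_norm > 0 else 1.0
            return w - lr * trust * r, m, v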

    Args:
        learning_rate (float|Variable, optional): the learning rate used to update parameters. \
            Can be a float value or a Variable with data type float32. Default 0.001.
        lamb_weight_decay (float, optional): The LAMB weight decay rate. Default 0.01. Note that
            ``weight_decay`` should be None, because LAMB applies its own layer-wise weight decay.
        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
            Default 0.9.
        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
            Default 0.999.
        epsilon (float, optional): A small float value for numerical stability. Default 1e-6.
        parameters (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. You can also specify different options for \
            different parameter groups, such as the learning rate and weight decay, by passing a \
            list of dicts. Note that the learning_rate in a parameter group \
            represents a scale applied to the base learning_rate. \
            The default value is None in static mode, in which case all parameters are updated.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_fluid_clip_ClipGradByGlobalNorm` , :ref:`api_paddle_fluid_clip_ClipGradByNorm` ,
            :ref:`api_paddle_fluid_clip_ClipGradByValue` ). If you want better convergence, it is recommended
            to use :ref:`api_paddle_fluid_clip_ClipGradByGlobalNorm` . Default None, meaning there is no gradient clipping.
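        exclude_from_weight_decay_fn (function, optional): Exclude a parameter from weight decay
            when it returns True for that parameter; the decay rate is then treated as zero for
            that parameter. Default None.
        multi_precision (bool, optional): Whether to maintain float32 master weights for float16
            parameters to improve numerical stability. Default False.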
        name (str|None, optional): For detailed information, please refer to
            :ref:`api_guide_Name` . Usually there is no need to set it; it is None by default.
    Examples:
        .. code-block:: python
            
            import paddle

            inp = paddle.uniform(shape=[10, 10], dtype='float32', min=-0.1, max=0.1)
            linear = paddle.nn.Linear(10, 10)
            out = linear(inp)
            loss = paddle.mean(out)
            lamb = paddle.optimizer.Lamb(learning_rate=0.002, parameters=linear.parameters(), lamb_weight_decay=0.01)
            loss.backward()
            lamb.step()
            lamb.clear_grad()
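
            # A sketch of skipping weight decay for bias parameters via
            # ``exclude_from_weight_decay_fn``; Paddle bias parameters
            # conventionally have names ending with '.b_0'.
            def exclude_fn(param):
                return param.name.endswith('.b_0')

            lamb = paddle.optimizer.Lamb(
                learning_rate=0.002,
                parameters=linear.parameters(),
                lamb_weight_decay=0.01,
                exclude_from_weight_decay_fn=exclude_fn,
            )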

    moment1moment2beta1_pow_accbeta2_pow_accMbP?{Gz??+?ư>NFc                    s   |d usJ |d usJ |d usJ |d usJ t t| j||d ||
d d| _|| _|| _|| _|| _|| _|||||d| _	i | _
i | _|	| _d S )N)learning_rate
parametersweight_decay	grad_clipnamelamb)beta1beta2epsilonlamb_weight_decayexclude_from_weight_decay_fn)superr   __init__type_beta1_beta2_epsilon_lamb_weight_decay_exclude_from_weight_decay_fn_default_dict_master_weights_used_master_weights_multi_precision)selfr   r!   r   r   r    r   r   r"   multi_precisionr   	__class__ ED:\Projects\ConvertPro\env\Lib\site-packages\paddle/optimizer/lamb.pyr$   f   s2   
zLamb.__init__c                 C   sz   |d u rt  }|| }| j|}|d ur7|| }| | ks)J | | ks3J ||fS d }||fS N)r   Zfind_varZ
get_tensorr-   getZ_dtypeshape)r/   r   scopeZp_tZmaster_nameZ
master_p_tr3   r3   r4   _get_parameter   s   zLamb._get_parameterc                 C   s   | j sJ |j| jv r| j|j }|S t| jtsJ |jd }t|}tj	||j
dddd}| jj }|jdd|gid|gi|jtjjjd	d
 || j|j< |S )NZ_fp32_masterr
   Zfloat32T)r   r7   valuedtypeZpersistablecastXZOut)Zin_dtypeZ	out_dtype)r%   inputsoutputsattrs)r.   r   r,   
isinstancehelperr	   r   generater   Zcreate_global_varr7   Zstartup_programZglobal_block	append_opr;   r   VarDescVarTypeFP32)r/   paramvarvar_nameblockr3   r3   r4   _create_master_weight   s.   


zLamb._create_master_weightc                 C   sh   t |tjsJ t |tr| |}|D ]}| jr,|jtjj	j
kr,| |}| | q| | qd S r5   )rA   r   Blockdict_update_param_groupr.   r;   r   rE   rF   FP16rL   _add_moments_pows)r/   rK   r   pZmaster_pr3   r3   r4   _create_accumulators   s   


zLamb._create_accumulatorsc                 C   s~   | j dur| j d | }| jo|jtjjjk}|r| j|j n|}|j}|| j	vs0|| j	| vr8t
d||| j	| | S )a
  Utility function to fetch an accumulator for a parameter
        Args:
            name: name of the accumulator
            param: parameter variable for which accumulator is to be fetched
        Returns:
            accumulator variable for the parameter
        N_z.Accumulator {} does not exist for parameter {})_namer.   r;   r   rE   rF   rP   r,   r   Z_accumulators	Exceptionformat)r/   r   rH   find_masterZtarget_paramtarget_namer3   r3   r4   _get_accumulator   s$   

zLamb._get_accumulatorc              	   C   s   |j }|tjjjkrtjjj}| j| j||d | j| j||d | j| j	||t
| jtr/dn| jdgtjjjdd | j| j||t
| jtrIdn| jdgtjjjdd d S )N)r;   r   r   cpu)r   rH   r;   Z
fill_valuer7   r%   Zdevicer   )r;   r   rE   rF   rP   rG   Z_add_accumulator_moment1_acc_str_moment2_acc_str_beta1_pow_acc_strrA   r&   r   Z
LOD_TENSOR_beta2_pow_acc_strr'   )r/   rR   Z	acc_dtyper3   r3   r4   rQ      s.   



zLamb._add_moments_powsc                 C   s  t |tjsJ t |tr| |}d|j_| | j|d }| | j	|d }| | j
|d }| | j|d }| jd urI| |d rId}n| j}| |}| jo]|d jtjjjk}	|d j}
|	rq| j|
 }|j| j|
< nd }| d}t rt|d |d ||||||||| j| j| j|	 d S t rt !|d |d |||||||d |||||d| jd| jd| jd	|d
|	 d S |d |d |||||d}|d ||||d}| j| j| j||	d}|	r||d< ||d< |r||d< |j"| j#|||dd}|S )NTr
   g        	found_infr   r   r   r    r   r0   )ParamZGradZLearningRateZMoment1ZMoment2ZBeta1PowZBeta2Pow)ZParamOutZ
Moment1OutZ
Moment2OutZBeta1PowOutZBeta2PowOut)r   r   r    r   r0   ZMasterParamZMasterParamOutZ
SkipUpdate)r%   r>   r?   r@   Zstop_gradient)$rA   r   rM   rN   rO   programZ	_use_lambrZ   r\   r]   r^   r_   r*   r)   Z_create_param_lrr.   r;   r   rE   rF   rP   r   r,   r-   Z_get_auxiliary_varZin_dygraph_moder   Zlamb_r&   r'   r(   Z_non_static_moder   r   rD   r%   )r/   rK   Zparam_and_gradr   r   r   r   r   lrrX   Zp_nameZmaster_weightr`   r>   r?   r@   Zlamb_opr3   r3   r4   _append_optimize_op   s   











zLamb._append_optimize_opc                 C   sr   | d| jd | _| d| jd | _| d| jd | _| d| jd | _| d| jd | _| d}|S )Nr   r   r    r!   r"   params)r6   r+   r&   r'   r(   r)   r*   )r/   r   r3   r3   r4   rO   H  s   

zLamb._update_param_group)
r   r   r   r   r   NNNFNr5   )__name__
__module____qualname____doc__r\   r]   r^   r_   r$   r9   rL   rS   rZ   rQ   rd   rO   __classcell__r3   r3   r1   r4   r      s0    C
&Zr   )Z	optimizerr   Zfluidr   r   Zfluid.frameworkr   r   r   Zfluid.layer_helperr	   Zpaddler   r   Zpaddle.fluid.executorr   __all__r   r3   r3   r3   r4   <module>   s   