o
    Qel                     @   s   d dl Z d dlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 dd	l
mZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ d dlmZ ddlmZmZ d dlZg ZG dd deZdS )    N)defaultdict   )	Optimizer)LRScheduler   )core)	framework)Variable	Parameter)unique_name)layers)LayerHelper)GradientClipBase)base)Callable)_C_ops_legacy_C_opsc                   @   s   e Zd ZdZdZdZdZdZ						
		
	
	
			
d%ddZdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd  Zejejd!d" Zd#d$ Zd
S )&AdamWa  
    The AdamW optimizer is implemented based on the AdamW Optimization
    in paper `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
    it can resolves the problem of L2 regularization failure in the Adam optimizer.

    .. math::

        t & = t + 1

        moment\_1\_out & = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad

        moemnt\_2\_out & = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad

        learning\_rate & = learning\_rate * 
            \frac{\sqrt{1 - {\beta}_2^t}}{1 - {beta}_1^t}

        param\_out & = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param)


    Args:
        learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
            It can be a float value or a LRScheduler. The default value is 0.001.
        parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. And you can specify different options for \
            different parameter groups such as the learning rate, weight decay, etc, \
            then the parameters are list of dict. Note that the learning_rate in paramter groups \
            represents the scale of base learning_rate. \
	    The default value is None in static mode, at this time all parameters will be updated.
        beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
            It should be a float number or a Tensor with shape [1] and data type as float32.
            The default value is 0.9.
        beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
            It should be a float number or a Tensor with shape [1] and data type as float32.
            The default value is 0.999.
        epsilon (float, optional): A small float value for numerical stability.
            The default value is 1e-08.
        weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01.
        lr_ratio (function|None, optional): If it is not None, 
            the learning rate will be updated with layerwise learning rate ratio.
            Otherwise, the learning rate is the original.
            Default: None.
        apply_decay_param_fun (function|None, optional): If it is not None,
            only tensors that makes apply_decay_param_fun(Tensor.name)==True
            will be updated with weight decay. It only works when we want to specify tensors.
            Default: None.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three cliping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
            The accumulators are updated at every step. Every element of the two moving-average
            is updated in both dense mode and sparse mode. If the size of parameter is very large,
            then the update may be very slow. The lazy mode only update the element that has
            gradient in current mini-batch, so it will be much more faster. But this mode has
            different semantics with the original Adam algorithm and may lead to different result.
            The default value is False.
        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false.
        name (str, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.
    **Notes**:
        **Currently, AdamW doesn't support sparse parameter optimization.**

    Examples:
        .. code-block:: python
            
            import paddle

            linear = paddle.nn.Linear(10, 10)
            inp = paddle.rand([10,10], dtype="float32")
            out = linear(inp)
            loss = paddle.mean(out)

            beta1 = paddle.to_tensor([0.9], dtype="float32")
            beta2 = paddle.to_tensor([0.99], dtype="float32")

            opt = paddle.optimizer.AdamW(learning_rate=0.1,
                    parameters=linear.parameters(),
                    beta1=beta1,
                    beta2=beta2,
                    weight_decay=0.01)
            out.backward()
            opt.step()
            opt.clear_grad()


            #Note that the learning_rate of linear_2 is 0.01.
            linear_1 = paddle.nn.Linear(10, 10)
            linear_2 = paddle.nn.Linear(10, 10)
            inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
            out = linear_1(inp)
            out = linear_2(out)
            loss = paddle.mean(out)
            opt = paddle.optimizer.AdamW(
                learning_rate=0.1,
                parameters=[{
                    'params': linear_1.parameters()
                }, {
                    'params': linear_2.parameters(),
                    'weight_decay': 0.001,
                    'learning_rate': 0.1,
                    'beta1': 0.8
                }],
                weight_decay=0.01,
                beta1=0.9)                   
            out.backward()
            opt.step()
            opt.clear_grad()

    moment1moment2beta1_pow_accbeta2_pow_accMbP??+?:0yE>N{Gz?Fc                 C   s  |d usJ |d usJ |d usJ |d usJ d|  kr%dk s*t d t dd|  kr7dk s<t d t dd|ksDt dt|tsSt|tjsStd|d urft|ts^J t sft	d|d urt|t
jtjjfr}tdt|t|trtd	t|| _nd | _|| _t r| jd u rtd
t|ttfstdt| |	d urt|	tstdd | _| jrt| jd tr| jD ]
}d|v sJ dq| jd d d j| _n| jd j| _t | _tdd | _d | _g | _i | _t | _| j | _!d| _|| _"t# | _$|| _%|| _&|	| _'|| _(|| _)|| _*|| _+|
| _,|| _-i | _.|||||
|	d| _/g | _0| jrWt| jd trW| jD ]
}| 1|2  qKn| j| _0d | _3d | _4i | _5d S )Nr   r   z.Invaild value of beta1, expect beta1 in [0,1).z.Invaild value of beta2, expect beta2 in [0,1).z.Invaild value of epsilon, expect epsilon >= 0.z'weight_decay should be float or Tensor.z/'lr_ratio' is unimplemented in CPU, XPU and NPUzt`parameters` argument given to the optimizer should be an iterable of paddle Tensors, but got argument type is `{}`.zv`parameters` argument should not get dict type, if parameter groups is needed, please set `parameters` as list of dictzNparameters argument given to the Optimizer should not be None in dygraph mode.z9learning rate should be float or LRScheduler, got %s herezE'grad_clip' should be an instance of GradientClipBase's derived classparamszYparams should be set in parameters if parameter groups are optimized in different optionsc                   S      t  S N)dict r!   r!   FD:\Projects\ConvertPro\env\Lib\site-packages\paddle/optimizer/adamw.py<lambda>       z AdamW.__init__.<locals>.<lambda>adamw)weight_decaybeta1beta2epsilon	lazy_mode	grad_clip)6
ValueError
isinstancefloatr   r	   	TypeErrorr   r   Zis_compiled_with_cudaNotImplementedErrorpaddleZTensoreagerformattyper    list_parameter_list_name_non_static_modeAttributeErrorr   r   Z_dtypedtypeZ_learning_rate_mapr   _accumulatorshelperZ_opti_name_listZ_accumulators_holderZ_param_device_mapZ
clear_gradZclear_gradientsZ_learning_rateset_params_name_apply_decay_param_fun_weight_decayZ
_grad_clip	_lr_ratio_beta1_beta2_epsilon
_lazy_mode_multi_precision_master_weights_default_dict_param_groups_add_param_groupcopyZ_use_multi_tensorregularization_auxiliary_vars)selflearning_rater'   r(   r)   
parametersr&   lr_ratioZapply_decay_param_funr+   r*   multi_precisionnameparam_groupr!   r!   r"   __init__   s   







	

zAdamW.__init__c                 C   s   || j |< d S r   rM   )rN   keyvalr!   r!   r"   _set_auxiliary_var  s   zAdamW._set_auxiliary_varc                 C   s   || j v r
| j | S d S r   rV   )rN   rW   r!   r!   r"   _get_auxiliary_var  s   

zAdamW._get_auxiliary_varc                 C   s   |d }t |tr|g|d< nt |trtdt||d< | j D ]
\}}||| q#t }| jD ]}|	t|d  q4|
t|d sMtd|d D ]}|dd|jd< qQ| j| dS )z
        Add a param group to parameter_list.

        Args:
            param_group (dict): The group of Tensors to be optimzed with
            different optimization options.
        r   z`optimizer parameters should be in ordered collections,but received set, please use list instead.z7some parameters appear in more than one parameter grouprO         ?N)r-   r
   r=   r/   r5   rH   items
setdefaultrI   update
isdisjointr,   getZoptimize_attrappend)rN   rT   r   kvZ	param_setgroupparamr!   r!   r"   rJ     s,   


zAdamW._add_param_groupc                 C   s   |j | jv r| j|j  }|S t| jtsJ |j d }t|}tj||j	dddd}| jj
 }|jdd|gid|gi|jtjjjd	d
 || j|j < |S )NZ_fp32_masterr   Zfloat32T)rS   shapevaluer:   ZpersistablecastXZOut)Zin_dtypeZ	out_dtype)r4   inputsoutputsattrs)rS   rG   r-   r<   r   r   generater   Zcreate_global_varrf   startup_programZglobal_block	append_opr:   r   VarDescVarTypeFP32)rN   re   varvar_nameblockr!   r!   r"   _create_master_weightB  s,   

zAdamW._create_master_weightc                 C   s~   | j dur| j d | }| jo|jtjjjk}|r| j|j n|}|j}|| j	vs0|| j	| vr8t
d||| j	| | S )a
  Utility function to fetch an accumulator for a parameter
        Args:
            name: name of the accumulator
            param: parameter variable for which accumulator is to be fetched
        Returns:
            accumulator variable for the parameter
        N_z.Accumulator {} does not exist for parameter {})r7   rF   r:   r   rp   rq   FP16rG   rS   r;   	Exceptionr3   )rN   rS   re   find_masterZtarget_paramtarget_namer!   r!   r"   _get_accumulatorZ  s$   

zAdamW._get_accumulatorc              	   C   s   |j }|tjjjkrtjjj}| j| j||d | j| j||d | j| j	||t
| jtr/dn| jdgtjjjdd | j| j||t
| jtrIdn| jdgtjjjdd d S )N)r:   r   r   cpu)rS   re   r:   Z
fill_valuerf   r4   Zdevicer   )r:   r   rp   rq   rx   rr   Z_add_accumulator_moment1_acc_str_moment2_acc_str_beta1_pow_acc_strr-   rB   r	   Z
LOD_TENSOR_beta2_pow_acc_strrC   )rN   pZ	acc_dtyper!   r!   r"   _add_moments_powso  s.   



zAdamW._add_moments_powsc                 C   s   t |tjsJ t |tr| |}|D ]-}| jr,|jtjj	j
kr,| |}| | q|jtjj	j
kr<| js<td | | qd S )NzAccumulating with FP16 in optimizer can lead to poor accuracy or slow convergence.Consider using multi_precision=True option of the Adam optimizer.)r-   r   Blockr    _update_param_grouprF   r:   r   rp   rq   rx   rv   r   warningswarn)rN   ru   rP   r   Zmaster_pr!   r!   r"   _create_accumulators  s   



zAdamW._create_accumulatorsc                 C   sp  t |tjsJ t |tr| |}|\}}d}| jd ur%| |js%d}| | j|d }| | j	|d }| | j
|d }| | j|d }	| joU|d jtjjjk}
|
r`| j|d j nd }| |}t r"| jd u rsdn| |d }t | jts| jn| j d}t | jts| jn| j d}t r| d}t|d |d |||||	||||| j|| j || j!d|
d\}}}}}}d S t"j#g |d |d |||||	||d ||||	|d| jd	| j!d
dd|d|d|d| j d|
d|R  \}}}}}}d S |d g|d g|g|g|g|g|	gd}| d}|rC||d< |d g|g|g|g|	gd}| j!d|
|| j | jd u radn| |d d}t | jtrw| j|d< n| j|d< t | jtr| j|d< n| j|d< t | jtr| j|d< n| j|d< |
r||d< ||d< |j$| j%|||dd}|S )NTFr   r[   	found_infr   i  r)   r*   min_row_size_to_use_multithreadr'   r(   
with_decaycoeffrR   rQ   )ParamZGradZLearningRateZMoment1ZMoment2ZBeta1PowZBeta2PowZ
SkipUpdate)ZParamOutZ
Moment1OutZ
Moment2OutZBeta1PowOutZBeta2PowOut)r*   r   rR   r   r   rQ   ZBeta1TensorZBeta2TensorZEpsilonTensorZMasterParamZMasterParamOut)r4   rj   rk   rl   stop_gradient)&r-   r   r   r    r   r?   rS   r|   r~   r   r   r   rF   r:   r   rp   rq   rx   rG   Z_create_param_lrr8   rA   rB   r	   numpyitemrC   in_dygraph_moderZ   r   Zadamw_rD   r@   rE   r   r%   ro   r4   )rN   ru   Zparam_and_gradre   Zgradr   r   r   r   r   rz   Zmaster_weightlrZ	lr_ratio_rB   rC   r   rw   rj   rk   rl   Zadamw_opr!   r!   r"   _append_optimize_op  sH  










		

	


zAdamW._append_optimize_opc                 C   s   d dd | jgS )N zWeight Decay, params:,)joinr>   )rN   r!   r!   r"   __str__  s   zAdamW.__str__c                 C   s  t | jd tsXg }| jD ]@}|jrq| durM| }t r4t|dr3| r3| j	dur3t
dnt|drF| rF| j	durFt
d|||f q| jdd|d}dS | jD ]c}tdd }|d	 D ]B}|jrmqg| dur| }t rt|dr| r| j	durt
dnt|dr| r| j	durt
d|d	 ||f qg|d
d | D  | jdd|d q[dS )a  
        Execute the optimizer and update parameters once.

        Returns:
            None

        Examples:
            .. code-block:: python

                import paddle
                
                a = paddle.rand([2,13], dtype="float32")
                linear = paddle.nn.Linear(13, 5)
                # This can be any optimizer supported by dygraph.
                opt = paddle.optimizer.AdamW(learning_rate = 0.01,
                                            parameters = linear.parameters())
                out = linear(a)
                out.backward()
                opt.step()
                opt.clear_grad()
        r   Nis_selected_rowszOAdamW don't support weight_decay with sparse parameters, please set it to None.
_is_sparse)Zlossrn   params_gradsc                   S   r   r   )r5   r!   r!   r!   r"   r#   H  r$   zAdamW.step.<locals>.<lambda>r   c                 S   s   i | ]\}}|d kr||qS )r   r!   ).0rb   rc   r!   r!   r"   
<dictcomp>^  s    zAdamW.step.<locals>.<dictcomp>)r-   r6   r    r   Z
_grad_ivarr   r   hasattrr   rL   RuntimeErrorr   ra   Z_apply_optimizerI   r   r^   r\   )rN   r   re   Zgrad_varZoptimize_opsrT   r!   r!   r"   step  s   








z
AdamW.stepc                 C   sr   | d| jd | _| d| jd | _| d| jd | _| d| jd | _| d| jd | _| d}|S )Nr'   r(   r)   r*   r&   r   )r`   rH   rB   rC   rD   rE   r@   )rN   rP   r!   r!   r"   r   d  s   
zAdamW._update_param_group)r   r   r   r   Nr   NNNFFN)__name__
__module____qualname____doc__r~   r   r   r   rU   rY   rZ   rJ   rv   r|   r   r   r   r   imperative_baseZno_gradr   Zdygraph_onlyr   r   r!   r!   r!   r"   r   "   s@    o
~$xOr   )r   collectionsr   Z	optimizerr   r   r   Zfluidr   r   Zfluid.frameworkr	   r
   r   r   Zfluid.layer_helperr   Z
fluid.clipr   Zfluid.dygraphr   r   collections.abcr    r   r   r1   __all__r   r!   r!   r!   r"   <module>   s"   