o
    Ne+                     @   s   d dl mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlm	Z	 d dlm
Z
 d dlmZ d d	lmZ d d
lZd dlmZmZ dgZG dd deZd
S )    )	Optimizer)L1DecayRegularizer)L2DecayRegularizer)core)	framework)program_guard)unique_name)layers)LayerHelperN)_C_ops_legacy_C_opsMomentumc                       sT   e Zd ZdZdZ							d fdd	Zdd	 Zd
d Zdd Zdd Z	  Z
S )r   aP  

    Simple Momentum optimizer with velocity state

    This optimizer has a flag for Nestrov Momentum.

    The update equations are as follows:

    .. math::

        & velocity = mu * velocity + gradient

        & if (use\_nesterov):

        &\quad   param = param - (gradient + mu * velocity) * learning\_rate

        & else:

        &\quad   param = param - learning\_rate * velocity

    Parameters:
        learning_rate (float|Variable): The learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element.
        momentum (float): Momentum factor
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        use_nesterov (bool, optional): Enables Nesterov momentum, default is false.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies 
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false.
        rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \
            Often choose to be ``1.0/batch_size``.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.fluid as fluid
            import numpy as np

            paddle.enable_static()

            place = fluid.CPUPlace()
            main = fluid.Program()
            with fluid.program_guard(main):
                x = paddle.static.data(name='x', shape=[1, 13], dtype='float32')
                y = paddle.static.data(name='y', shape=[1], dtype='float32')
                linear = paddle.nn.Linear(13, 1)
                y_predict = linear(x)
                cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y)
                avg_cost = paddle.mean(cost)

                moment_optimizer = fluid.contrib.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
                moment_optimizer.minimize(avg_cost)

                fetch_list = [avg_cost]
                train_reader = paddle.batch(
                    paddle.dataset.uci_housing.train(), batch_size=1)
                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
                exe = fluid.Executor(place)
                exe.run(paddle.static.default_startup_program())
                for data in train_reader():
                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)

    velocityNF      ?c
                    s   |d usJ |d usJ dd }
|
|rd n|}t t| j|||||	d d| _|| _t|| _d| _d| _t	|t
rBd| _|j| _|| _|| _i | _d S )Nc                 S   s
   t | tS )N)
isinstancer   )Zregular r   ND:\Projects\ConvertPro\env\Lib\site-packages\paddle/fluid/contrib/optimizer.py<lambda>w   s   
 z#Momentum.__init__.<locals>.<lambda>)learning_rateparameter_listregularization	grad_clipnamemomentum r   Zl2_decay)superr   __init__type	_momentumbool_use_nesterov_regularization_method_regularization_coeffr   r   _multi_precision_rescale_grad_master_weights)selfr   r   r   use_nesterovr   r   multi_precisionrescale_gradr   	predicateZ
py_regular	__class__r   r   r   k   s*   



zMomentum.__init__c                 C   s   t | jtsJ |jd }t|}tj||jdddd}| jj	
 }|jdd|gid|gi|jtjjjd	d
 || j|j< |S )NZ_fp32_masterr   Zfloat32T)r   shapevaluedtypeZpersistablecastXZOut)Zin_dtypeZ	out_dtype)r   inputsoutputsattrs)r   helperr
   r   r   generater	   Zcreate_global_varr-   Zstartup_programZglobal_block	append_opr/   r   VarDescVarTypeZFP32r%   )r&   paramvar_namevarblockr   r   r   _create_master_weight   s&   

zMomentum._create_master_weightc                 C   s~   | j dur| j d | }| jo|jtjjjk}|r| j|j n|}|j}|| j	vs0|| j	| vr8t
d||| j	| | S )a  Utility function to fetch an accumulator for a parameter

        Args:
            name: name of the accumulator
            param: parameter variable for which accumulator is to be fetched

        Returns:
            accumulator variable for the parameter
        N_z.Accumulator {} does not exist for parameter {})_namer#   r/   r   r8   r9   FP16r%   r   Z_accumulators	Exceptionformat)r&   r   r:   find_masterZtarget_paramtarget_namer   r   r   _get_accumulator   s$   


zMomentum._get_accumulatorc                 C   s|   t |tjsJ |D ]1}| jr$|jtjjjkr$| 	|}| 
| j| q
|jtjjjkr4| js4td | 
| j| q
d S )NzAccumulating with FP16 in optimizer can lead to poor accuracy or slow convergence.Consider using multi_precision=True option of the Momentum optimizer.)r   r   Blockr#   r/   r   r8   r9   rA   r>   Z_add_accumulator_velocity_acc_strwarningswarn)r&   r=   
parameterspZmaster_pr   r   r   _create_accumulators   s   
zMomentum._create_accumulatorsc                 C   s*  t |tjsJ | | j|d }| |}| jo"|d jtj	j
jk}|r-| j|d j nd }t rXt|d |d ||||d ||d| jd| jd| jd| jd|\}}}d S | j| j| j| j|| jd}|d g|d g|g|gd	}	|d g|gd
}
|r||	d< ||
d< |j| j|	|
|dd}|S )Nr      mur'   regularization_methodregularization_coeffr(   )rO   r'   rP   rQ   r(   r)   )ParamZGradVelocityZLearningRate)ZParamOutZVelocityOutZMasterParamZMasterParamOutT)r   r2   r3   r4   Zstop_gradient)r   r   rG   rF   rH   Z_create_param_lrr#   r/   r   r8   r9   rA   r%   r   Z_non_static_moder   r   r   r    r!   r"   r$   r7   r   )r&   r=   Zparam_and_gradZvelocity_acclrrD   Zmaster_weightr?   r4   r2   r3   Zmomentum_opr   r   r   _append_optimize_op   sb   


	zMomentum._append_optimize_op)NFNNFr   N)__name__
__module____qualname____doc__rH   r   r>   rF   rM   rU   __classcell__r   r   r+   r   r      s    K)Zpaddle.fluid.optimizerr   Zpaddle.fluid.regularizerr   r   Zpaddle.fluidr   r   Zpaddle.fluid.frameworkr   r   r	   Zpaddle.fluid.layer_helperr
   rI   Zpaddler   r   __all__r   r   r   r   r   <module>   s   