o
    Nev.                    @   s  d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dl	Z	d dl
mZ d dlmZmZmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZmZm Z m!Z!m"Z" ddlm#Z# ddl$m%Z% ddl&m'Z' ddlm(Z( ddl)m*Z+ ddl)m,Z, ddl-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl4m6Z6 ddl7m8Z8 ddlm9Z: d dl;Z;d dl	m<Z<m=Z= ddl>m?Z?m@Z@mAZA g dZBG dd deCZDG dd  d eDZEG d!d" d"eDZFG d#d$ d$eDZGG d%d& d&eDZHG d'd( d(eDZIG d)d* d*eDZJG d+d, d,eDZKG d-d. d.eDZLG d/d0 d0eDZMG d1d2 d2eDZNG d3d4 d4eDZOG d5d6 d6eDZPG d7d8 d8eJZQeEZReFZSeIZTeJZUeKZVeLZWeMZXeNZYeOZZePZ[eHZ\eQZ]G d9d: d:eDZ^G d;d< d<eCZ_G d=d> d>eCZ`G d?d@ d@eDZaG dAdB dBeCZbG dCdD dDeCZcdS )E    )print_functionN)defaultdict)find_distributed_lookup_table)ProgramVariable	Parameter
name_scopedefault_main_programdefault_startup_programdevice_guard   )	framework)layers)unique_name)append_backward_some_in_set__append_grad_suffix__get_no_grad_set_name)GradientClipBaseGradientClipByNormerror_clip_callbackappend_gradient_clip_opsClipGradByGlobalNorm)program_guard)Constant)LayerHelper)ops)base)no_grad)LearningRateDecay_LearningRateEpochDecay)core)tensor)reduce)
cmp_to_key)signature_safe_contextmanager   )compat)_C_ops_legacy_C_ops)_in_legacy_dygraphin_dygraph_mode_current_expected_place)SGDMomentumAdagradAdamAdamaxDpsgdDecayedAdagradFtrlSGDOptimizerMomentumOptimizerAdagradOptimizerAdamOptimizerAdamaxOptimizerDpsgdOptimizerDecayedAdagradOptimizerRMSPropOptimizerFtrlOptimizerAdadeltaAdadeltaOptimizerModelAverageLarsMomentumLarsMomentumOptimizerLambOptimizerExponentialMovingAveragePipelineOptimizerLookaheadOptimizerRecomputeOptimizerc                   @   sx  e Zd ZdZej						dDddZejdd Z	ejd	d
 Z
e
Zdd Zdd Zdd Zdd Zejdd Zejdd ZdEddZdd Zdd Zdd Zdd  Z		!			dFd"d#Z		!			dFd$d%Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Z				dGd2d3ZdEd4d5Z 	dEd6d7Z!d8d9 Z"d:d; Z#d<d= Z$dEd>d?Z%ejd@dA Z&ej			dHdBdCZ'dS )I	OptimizerzOptimizer Base class.

    Define the common interface of an optimizer.
    User should not use this class directly,
    but need to use one of it's implementation.
    NFc           
      C   st  ddl m} |durt|nd| _|| _t rLt|tt	|fs(t
dt| | jdu r1td|durK| jD ]}	|	jdurJtd|    nq8nt|ttj|fs]t
dt| |durjt|tsjt
d|| _|| _|| _|| _|| _d| _| jr| jd j| _t | _t| jtjr| j| jt < tdd	 | _i | _t | j!j"| _#g | _$i | _%t | _&t | _'dS )
a-  
        Args:
            flatten_param_grads (bool, optional): Whether to flatten all the parameters and grads. 
                If true, the parameters and gradients will be coalesce to contiguous mempry, 
                and the grad_clip ops / optimizer ops will be fuse to one operator.
        r   LRSchedulerNz9learning rate should be float or LRScheduler, got %s herezRparameter_list argument given to the Optimizer should not be None in dygraph mode.If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!zE'grad_clip' should be an instance of GradientClipBase's derived classc                   S   s   t  S N)dict rO   rO   FD:\Projects\ConvertPro\env\Lib\site-packages\paddle/fluid/optimizer.py<lambda>   s    z$Optimizer.__init__.<locals>.<lambda>)(paddle.optimizer.lrrK   list_parameter_list_namer   _non_static_mode
isinstancefloatr   	TypeErrortypeAttributeErrorregularizerlogginginfo__str__r   r   regularization
_grad_clip_learning_rate_flatten_param_grads_align_size_dtypedtyperN   _learning_rate_mapr	   r   _accumulators_global_accumulatorsr   	__class____name__helper_opti_name_list_accumulators_holder_param_device_map_auxiliary_vars)
selflearning_rateparameter_listr`   	grad_clipflatten_param_grads
align_sizenamerK   paramrO   rO   rP   __init__D   s   




zOptimizer.__init__c                 C   s   ddl m} i }| j D ]\}}| D ]	\}}|||j< qq| j D ]	\}}|||j< q%t| j|r>| j |d< |S t| jt	rk| j |d< t| jt
skd}tjdddd}tjdgd| jj|d	 ||d< |S )
a   
        Get state dict information from optimizer. It contain all the variable used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be include in state dict.
        If the optimizer never be called(minimize function), the state_dict is empty.

        Args: None
        Return:
            state_dict(dict) : dict contains all the variable used by optimizer
        
        Examples:
            .. code-block:: python

                import paddle.fluid as fluid

                with fluid.dygraph.guard():
                    emb = fluid.dygraph.Embedding([10, 10])

                    adam = fluid.optimizer.Adam(0.001, parameter_list=emb.parameters())
                    state_dict = adam.state_dict()

        r   rJ   LR_SchedulerNglobal_stepint32)rw   rf   r   )out)rR   rK   rh   itemsrw   ri   rW   rb   
state_dictr   r    r   Z_varbase_creatorr"   fill_constantstep_num)rq   rK   r   kv	para_namevar_tmpZvar_temprO   rO   rP   r      s4   zOptimizer.state_dictc           
      C   s  ddl m} t| j|r| j|d  t| jtr{| j|d  t| jts{d|v s0J d|d }t|trZ|}t	|
  }|jdksQJ d|jt|d | j_n!t|tjrt|jdksmJ d|j|d | j_ntdt|d	d
 }|| _| j D ]\}}| D ]\}}	|	j|v sJ d|	j|||	 qq| j D ]\}}|j|v sJ d|j||| qdS )aE  
        Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed.

        Args: 
            state_dict(dict) : Dict contains all the Variable needed by optimizer
        Return:
            None
        
        Examples:
            .. code-block:: python

                import paddle
                import paddle.fluid as fluid

                paddle.disable_static()

                emb = paddle.nn.Embedding(10, 10)

                state_dict = emb.state_dict()
                fluid.save_dygraph(state_dict, "paddle_dy")

                scheduler = paddle.optimizer.lr.NoamDecay(	
                    d_model=0.01, warmup_steps=100, verbose=True)
                adam = paddle.optimizer.Adam(
                    learning_rate=scheduler,
                    parameters=emb.parameters())
                state_dict = adam.state_dict()
                fluid.save_dygraph(state_dict, "paddle_dy")

                para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy")
        r   rJ   rz   r{   z\Global step not in state dict, Dygraph use LearningRateDecay, global_step must in state_dictr   z*global step shape is (1,), the shape is {}zUType not supprt, value in state dict must be [VarBase, Variable, numpy], the type is c                 S   s   |  }| }t|}| |j }t|tr| }nt|tj	r'| }nt|tj
r0|}ntdtt||j|jksMJ d|j|j|j|j|jks_J d|j|j|j||t  d S )NzState dict type {} not supprtzkParameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}zlParameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {}  but load tensor with dtype {})value
get_tensornparrayrw   rW   r   numpyr!   ZVarBasendarrayRuntimeErrorformatstrrZ   shaperf   setr   r,   )r   rx   varr"   Zmodel_npZ	load_paraZload_para_nprO   rO   rP   _load_state_para  s0   





z2Optimizer.set_state_dict.<locals>._load_state_parazoptimizer variable {} not foundN)rR   rK   rW   rb   set_dictr   r    r   r   r   r   r   r   r   intr   r   r   rZ   rn   rh   r~   rw   ri   )
rq   r   rK   r{   Zstep_npr   r   r   r   r   rO   rO   rP   set_state_dict   sR   !





zOptimizer.set_state_dictc                 C   s   | j S rM   )rm   rq   rO   rO   rP   get_opti_var_name_list/  s   z Optimizer.get_opti_var_name_listc                 C   s   || j |< d S rM   rp   )rq   keyvalrO   rO   rP   _set_auxiliary_var2  s   zOptimizer._set_auxiliary_varc                 C   s   || j v r
| j | S d S rM   r   )rq   r   rO   rO   rP   _get_auxiliary_var5  s   

zOptimizer._get_auxiliary_varc                 C   s  ddl m} t| j|rX|  }t|tjsEtd}|| j_	| j
j|dgdd| jd u r.dn| jd}t }| j|_||_|| jt < t|  }| j
j|t|dd	 d S t rt| jtr|  }t|tjrnd S tjtddgt| j| jd u rdn| jdd
| jt < d S t| jtr|  | jt < d S td|  }t|tjrd S t| jtstdtjtddgt| j| jd u rdn| jdd
| jt < d S )Nr   rJ   rr   r   Tfloat32)rw   r   persistablestop_gradientrf   r   initializerrw   r   r   rf   r   z<optimizer's learning rate must be float or LearningRateDecayzllearning rate variable is create outside optimizer,can not create new learning rate variable for new program)rR   rK   rW   rb   _global_learning_rater   r   r   generateZ	_var_namerl   create_global_variablere   r	   Zlr_shedulerlr_varrg   rX   set_variable_initializerr   imperative_baseenabledr   create_global_varr   rY   )rq   rK   r   Zlr_nameZ	main_progZlr_valuelrrO   rO   rP   _create_global_learning_rate;  s~   

z&Optimizer._create_global_learning_ratec              
   C   s$  t |tjtfstdt| t | jtrtdt |trw|| _| 	 }|durut
 rAt }t|t|jt||j| dS t rWt|dt|d|jdt|j dS t  }|jdd|gi|jt|jt|d	d
d dS dS t|jdkr|jd dksJ d|| jt < dS )a|  
        :api_attr: imperative
        
        Set the value of the learning rate manually in the optimizer. If the optimizer use LearningRateDecay,
        this API cannot be invoked, because it will lead to conflict.

        Args:
            value (float|Variable): the value of learning rate

        Returns:
            None
          
        Examples:
            .. code-block:: python

                import paddle.fluid as fluid
                        
                with fluid.dygraph.guard():
                    linear = fluid.dygraph.nn.Linear(10, 10)

                    adam = fluid.optimizer.Adam(0.1, parameter_list=linear.parameters())

                    # set learning rate manually by python float value
                    lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
                    for i in range(5):
                        adam.set_lr(lr_list[i])
                        lr = adam.current_step_lr()
                        print("current lr is {}".format(lr))
                    # Print:
                    #    current lr is 0.2
                    #    current lr is 0.3
                    #    current lr is 0.4
                    #    current lr is 0.5
                    #    current lr is 0.6


                    # set learning rate manually by framework Variable
                    lr_var = fluid.layers.create_global_var(
                        shape=[1], value=0.7, dtype='float32')
                    adam.set_lr(lr_var)
                    lr = adam.current_step_lr()
                    print("current lr is {}".format(lr))
                    # Print:
                    #    current lr is 0.7



        zSThe type of 'value' in optimizer.set_lr must be (float, Variable), but received %s.znoptimizer's learning rate can't be LearningRateDecay when invoke this API, because this will lead to conflict.Nr   rf   r   r   Out)rf   r   r   T)rZ   outputsattrsr   r   r   z:optimizer's learning rate must be 1-D Tensor with shape[1])rW   r   r   rX   rY   rZ   rb   r   r   r   r+   r,   r(   Zfull_rS   r   rf   r*   r)   r   r	   global_block	append_oplenrg   )rq   r   
current_lrplacer   rO   rO   rP   set_lr  sZ   2

zOptimizer.set_lrc                 C   s~   |   }t|tjr|    d S t| jtr| jS t| jtr+|  }| d S | j }t|tt	fr9|S | d S )a  
        :api_attr: imperative
        
        Get current step learning rate. The return value is all the same When LearningRateDecay is not used,
        otherwise return the step learning rate.

        Returns:
            float: The learning rate of the current step.

        Examples:
            .. code-block:: python

                import paddle.fluid as fluid
                import numpy as np

                # example1: LearningRateDecay is not used, return value is all the same
                with fluid.dygraph.guard():
                    emb = fluid.dygraph.Embedding([10, 10])
                    adam = fluid.optimizer.Adam(0.001, parameter_list = emb.parameters())
                    lr = adam.current_step_lr()
                    print(lr) # 0.001

                # example2: PiecewiseDecay is used, return the step learning rate
                with fluid.dygraph.guard():
                    inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
                    linear = fluid.dygraph.nn.Linear(10, 10)
                    inp = fluid.dygraph.to_variable(inp)
                    out = linear(inp)
                    loss = fluid.layers.reduce_mean(out)
                    
                    bd = [2, 4, 6, 8]
                    value = [0.2, 0.4, 0.6, 0.8, 1.0]
                    adam = fluid.optimizer.Adam(fluid.dygraph.PiecewiseDecay(bd, value, 0),
                                           parameter_list=linear.parameters())

                    # first step: learning rate is 0.2
                    np.allclose(adam.current_step_lr(), 0.2, rtol=1e-06, atol=0.0) # True

                    # learning rate for different steps
                    ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0]
                    for i in range(12):
                        adam.minimize(loss)
                        lr = adam.current_step_lr()
                        np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True

        r   )
r   rW   r   r   r   rb   rX   r    stepr   )rq   r   Zstep_lrrO   rO   rP   current_step_lr  s   0
zOptimizer.current_step_lrc                 C   s   |du rt  }| j|dS )zC
        get global decayed learning rate
        :return:
        N)r   r	   rg   get)rq   programrO   rO   rP   r     s   zOptimizer._global_learning_ratec                 C   s   t  )zP append optimize operator to block and return all the added optimize_op
        NotImplementedError)rq   blockparam_and_gradrO   rO   rP   _append_optimize_op!  s   zOptimizer._append_optimize_opc              	   C   s   |d }|j d }t|tkr|S |dkr|  S t jdd+ td |  | W  d    W  d    S 1 s>w   Y  W d    d S 1 sNw   Y  d S )Nr   rr         ?T)Zis_with_optZscale_with_param_lr)optimize_attrrZ   r   r   r	   _lr_schedule_guardr   r   )rq   r   rx   Zparam_lrrO   rO   rP   _create_param_lr&  s   

RzOptimizer._create_param_lrc                 C      dS )zCreate all accumulators needed by the parameters

        Args:
            block: the block in which the loss variable is present
            parameters: list of parameter variables for the optimizer
        NrO   )rq   r   
parametersrO   rO   rP   _create_accumulators5  s   zOptimizer._create_accumulatorsc                 C   r   )a  Finish any custom updates needed
           before completing an optimization step

        Args:
            block: the block in which the loss variable is present
            parameters: list of parameter variables for the optimizer

        Returns:
            None
        NrO   )rq   r   parameters_and_gradsrO   rO   rP   _finish_update>  s   zOptimizer._finish_update        c           
      C   s  | j dur| j d | }|| jv r.|j| j| v r.t r%| j| |j S td||j|dkr5|j}t| j	t
s=J |jd | }t|}| j| | j	j|d|pX|jt ratjjjn|du rh|jn||dd}	|du rx| |j}t| | j	j|	tt|dd W d   n1 sw   Y  t rt| jdkr|| jv sJ d	||	| j|  |	| j| |j< |	S )
a  Utility function to add an accumulator for a parameter

        Args:
            block: the block in which the loss variable is present
            name: name of the accumulator
            param: parameter variable for which accumulator is to be added
            dtype: data type of the accumulator variable
            fill_value: value to initialize the accumulator variable
        N_z.Accumulator {} already exists for parameter {}Trw   r   rf   rZ   r   belong_to_optimizerr   r   r   ,Optimizer set error, {} should in state dict)rU   rh   rw   r   rV   	Exceptionr   r   rW   rl   r   r   r   rm   appendr   rf   r!   VarDescVarType
LOD_TENSORrZ   _get_device_for_paramr   r   r   rX   r   rn   	set_value)
rq   rw   rx   rf   
fill_valuer   rZ   devicevar_namer   rO   rO   rP   _add_accumulatorK  sT   


	
zOptimizer._add_accumulatorc           	      C   s6  | j dur| j d | }|| jv r!t r| j| S td||dkr(dg}t| jts0J |}t	
|}| j| | jj|d|rF|n| j||dd}|du rTd}t| | jj|tt|dd	 W d   n1 spw   Y  t rt| jd
kr|| jv sJ d||| j|  || j|< |S )a  Utility function to add a global accumulator for all parameters in the model

        Args:
            block: the block in which the loss variable is present
            name: name of the accumulator
            dtype: data type of the accumulator variable
            fill_value: value to initialize the accumulator variable
            shape: the shape of the accumulator
            type: the variable type of the accumulator
            device: the target place of the accumulator
        Nr   z$Global accumulator {} already existsr   Tr   cpur   r   r   r   )rU   ri   r   rV   r   r   rW   rl   r   r   r   rm   r   r   re   r   r   r   rX   r   rn   r   )	rq   rw   rf   r   r   rZ   r   r   r   rO   rO   rP   _add_global_accumulator  sF   





z!Optimizer._add_global_accumulatorc                 C   sT   | j dur| j d | }|| jvs|j| j| vr"td||j| j| |j S )zUtility function to fetch an accumulator for a parameter

        Args:
            name: name of the accumulator
            param: parameter variable for which accumulator is to be fetched

        Returns:
            accumulator variable
        Nr   .Accumulator {} does not exist for parameter {})rU   rh   rw   r   r   )rq   rw   rx   rO   rO   rP   _get_accumulator  s   


zOptimizer._get_accumulatorc                 C   s:   | j dur| j d | }|| jvrtd|| j| S )zUtility function to fetch a global accumulator

        Args:
            name: name of the accumulator

        Returns:
            accumulator variable
        Nr   z$Global accumulator {} does not exist)rU   ri   r   r   rq   rw   rO   rO   rP   _get_global_accumulator  s
   
	

z!Optimizer._get_global_accumulatorc           	      C   sb   |D ],}|d j du r.|d j}|j}tj }|D ]}|j}||v r-||| j|<  nqqd S )Nr   T)		trainablerw   r   r!   op_proto_and_checker_makerkOpDeviceAttrNameinput_arg_namesattrro   )	rq   r   target_blockr   
param_namer   Zdevice_attr_nameopr   rO   rO   rP   _update_param_device_map  s   


z"Optimizer._update_param_device_mapc                 C   s   d }|| j v r| j | }|S rM   )ro   )rq   r   r   rO   rO   rP   r     s   

zOptimizer._get_device_for_paramc           
      C   s  t   }|}t   }|j|jkr%|jdksJ dt  j|j }t|j}| 	|| | 
|dd |D  |   t  r]|D ]}|d du rNqE|d jdu r[| || qEnb|D ]_}|d du rhq_|d jj|E td	1 |d jdu r| |d j}t| | ||}W d   n1 sw   Y  W d   n1 sw   Y  W d   n1 sw   Y  q_| || t|j}	|||	S )
a  Add optimization operators to update gradients to variables.

        Args:
          parameters_and_grads(list(tuple(Variable, Variable))):
            a list of (variable, gradient) pair to update.

        Returns:
          return_op_list: a list of operators that will complete one step of
            optimization. This will include parameter update ops, global step
            update ops and any other custom ops required by subclasses to manage
            their internal state.
        rI   zFcurrent block is not global_block, but it doesn't have backward block.c                 S   s   g | ]}|d  j r|d  qS r   )r   .0prO   rO   rP   
<listcomp>  s    z7Optimizer._create_optimization_pass.<locals>.<listcomp>r   Nr   T	optimizer)r   r	   r   current_blockidxZbackward_block_idxblocksr   r   r   r   r   rV   r   r   r   r   _optimized_guardr   r   rw   r   r   Z
_slice_ops)
rq   r   r   r   r   startr   r   Zoptimize_opendrO   rO   rP   _create_optimization_pass  sd   

 
z#Optimizer._create_optimization_passc              	   C   s  t  }t   }t|}d}d}g }|D ]\}}	|j|kr,|dur'td|}|	}q|||	f qd}
|dur||g}|jj	|4 t 
d |   |jd||| |dd|d id}
W d   n1 smw   Y  W d   n1 s|w   Y  |||f|
fS )	a  
        Because distribute lookup table only support SGD optimizer for now, not support
        other optimizer and regularization, so we should find the table parameter out,
        and avoid to add regularization and other op for it, and add sgd optimize op
        for it independently.
        :param param_grads(list((Var, Var))): list of (param, grad) pair.
        :param loss: the loss variable.
        :param startup_program: the startup program
        Nz1multi dist table var found, only support one now!r   sgdParamGradLearningRateParamOutr   rZ   inputsr   )r   r	   r   r   rw   r   r   r   r   r   r   r   r   r   )rq   Zparam_gradsr   r   Z
table_nameZtable_paramZ
table_gradZnew_param_gradsr   gsgd_opr   rO   rO   rP   _process_distribute_lookuptable4  sD   


 z)Optimizer._process_distribute_lookuptablec                 C   s  d}t  rn| ||}| jdu r|j| _t  r?|r|n| j}g }|D ]}|js+q%| dur<| }	|||	f q%|S |du rGt	g}nt
|tsNJ |jj}
t|jdkr`|jd dkshJ d|j|rl|n| j}t|
| t||||}W d   |S 1 sw   Y  |S )a  
        The first part of ``minimize``, do auto-diff to append backward operations for
        the current program.

        Args:
            loss (Variable): ``loss`` variable to run optimizations.
            startup_program (Program, optional): :ref:`api_fluid_Program` for
                initializing parameters in ``parameter_list``. The default value
                is None, at this time :ref:`api_fluid_default_startup_program` will be used.
            parameter_list (Iterable, optional): Iterable of ``Variable`` or ``Variable.name`` to update
                to minimize ``loss``. The default value is None, at this time all parameters
                will be updated.
            no_grad_set (set, optional): Set of ``Variable``  or ``Variable.name`` that don't need
                to be updated. The default value is None.
            callbacks (list, optional): list of callable objects to run when appending backward
                operator for one parameter. The default value is None.

        Return:
            list: list of (param, grad) variable pairs, param is ``Parameter``,
                grad is the gradient value corresponding to the parameter.

        Examples:
            See examples in ``apply_gradients``.
        Nr   r   zThe loss.shape should be (1L,), but the current loss.shape is {}. Maybe that you should call paddle.mean to process the current loss.)r   rV   _get_no_grad_setre   rf   rT   r   Z
_grad_ivarr   r   rW   rS   r   r   r   r   r   r   r   )rq   lossstartup_programrs   no_grad_set	callbacksZact_no_grad_setparams_gradsrx   grad_varr   rO   rO   rP   backward^  sP   

zOptimizer.backwardc                 C   s   |du st |drt |dr|jdu r|du r|S d}t |dr.|jdur.||||j}n|dur9||||j}|dus?J t rJt||gS |}|jtj	j
jkrj|jj|jt  |j|j|jtj	j
jd}d||gi}d|gi}|jjd||d |S )zu Create and add backward regularization Operators
    
        Function helper of append_regularization_ops.
        Nr\   )rw   rf   r   	lod_levelrZ   Xr   sumr   )hasattrr\   r   r   rV   r)   r  rZ   r!   r   r   SELECTED_ROWS
create_varrw   ZkNewGradSuffixrf   r   r	  r   r   )rq   rx   gradr`   Zregularization_termnew_gradr   r   rO   rO   rP   _create_regularization_of_grad  s:   

z(Optimizer._create_regularization_of_gradc              
   C   s   g }t  r|D ]\}}| |||}|||f q|S d}t dP |D ]D\}}|sDt|dddurD|durDd}td|   |j	j
||g | |||}|||f W d   n1 sfw   Y  q'W d   |S 1 sww   Y  |S )ar  Create and add backward regularization Operators
    
        Creates and adds backward regularization operators in the BlockDesc.
        This will add gradients of the regularizer function to the gradients
        of the parameters and return these modified gradients. This is the
        same as implementing weight decay in optimizers for regularization.
    
        Args:
            parameters_and_grads: A list of (parameters, gradients) pairs
                                  that need to be regularized.
            regularization: A global regularizer. If the parameter is not
                            set. It will be applied with regularizer.
    
        Returns:
            list[(Variable, Variable)]: list of (parameters, gradients) \
            pair with the regularized gradient
    
        Raises:
            Exception: Unknown regularization type
        Fr`   r\   NTrL   )r   rV   r  r   r   getattrr]   r^   r_   r   r   r   )rq   r   r`   Zparams_and_gradsrx   r  r  Zrepeate_regularizerrO   rO   rP   append_regularization_ops  sH   
z#Optimizer.append_regularization_opsc           
   
   C   s  g }g }|D ]6\}}|d u rqd|_ t|dddu s"t|dd d ur2td|j d| _|  S || || qdd |D }|d j}| j	j
d	d|d jt|gdd
}d|_|d j|_|d j|_| j	j
dd|d jt|gdd
}	tt 6 |jdd|i||ddd| j|d jdd |jdd|i||	ddd| j|d jdd W d    n1 sw   Y  | j	j|tdd | j	j|	tdd ||	fgS )NTZ	need_clipFr\   zmflatten_param_grads=True will be discarded since paramter '{}''s need_clip is False or the regularizer is setc                 S   s   g | ]}t |jqS rO   )r   prodr   r   rO   rO   rP   r         z1Optimizer.flatten_param_grads.<locals>.<listcomp>r   flatten_param)rw   r   rf   r   r   flatten_gradcoalesce_tensorInputOutputZFusedOutput)	copy_data	use_alignrv   rf   rZ   r   r   r   r   r   )r   r  warningswarnr   rw   rc   r   r   rl   r   rf   r   r  r   r   r\   r   r	   r   rd   r   r   )
rq   r  Zneed_flatten_paramsZneed_flatten_gradsr   r   r   r   r  r  rO   rO   rP   ru     s   




zOptimizer.flatten_param_gradsc                 C   sz   t |dd d}| jr | jdu r | jdkst| jtr | |}| jdur+| |}nt|}| || j}| 	|}|S )a  
        Second part of `minimize`, appending optimization operators for
        given `params_grads` pairs.

        Args:
            params_grads (list): list of (param, grad) pair to do optimization.

        Returns:
            list: A list of operators appended to the current program.

        Examples:
            .. code-block:: python

                import paddle.fluid as fluid
                loss = network()
                optimizer = fluid.optimizer.SGD(learning_rate=0.1)
                params_grads = optimizer.backward(loss)
                # you may append operations for params_grads here
                # ...
                optimizer.apply_gradients(params_grads)
        c                 S   
   | d j S Nr   rw   xrO   rO   rP   rQ   ]     
 z+Optimizer.apply_gradients.<locals>.<lambda>r   N)
sortedrc   r`   ra   rW   r   ru   r   r  r   )rq   r  optimize_opsrO   rO   rP   apply_gradientsG  s   


zOptimizer.apply_gradientsc                 C   s   t  r6tt  t   | jdur| |}| || j}| |}W d   |S 1 s/w   Y  |S |j	j
}t|| | |}W d   |S 1 sPw   Y  |S )a  
        Second part of `minimize`, appending optimization operators for
        given `params_grads` pairs.
        Args:
            loss (Variable): loss variable to run optimizations.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            params_grads (list): list of (param, grad) pair to do optimization.
        Returns:
            list: A list of operators appended to the current program.
        N)r   rV   r   r	   r
   ra   r  r`   r   r   r   r*  )rq   r  r  r  r)  r   rO   rO   rP   apply_optimizer  s*   



zOptimizer.apply_optimizec                 C   s8   t |}|jj  }tdd |D }|| |S )Nc                 S   s   g | ]
}|j d u r|jqS )F)r   rw   r   rx   rO   rO   rP   r     s    z.Optimizer._get_no_grad_set.<locals>.<listcomp>)r   r   r   r   all_parametersr   update)rq   r  r  r   Zparam_no_trainablerO   rO   rP   r    s   
zOptimizer._get_no_grad_setc                 C   s   | j D ]	}|jr|  qdS )a  
        Clear the gradients of all optimized parameters for model.

        If not, new gradient will accumulat on previous gradient.
        
        Returns:
            None
        
        Examples:
            .. code-block:: python

                import paddle.fluid as fluid
                import numpy as np

                with fluid.dygraph.guard():
                    value = np.arange(26).reshape(2, 13).astype("float32")
                    a = fluid.dygraph.to_variable(value)
                    linear = fluid.Linear(13, 5, dtype="float32")
                    # This can be any optimizer supported by dygraph.
                    adam = fluid.optimizer.Adam(learning_rate = 0.01, 
                                                parameter_list = linear.parameters())
                    out = linear(a)
                    out.backward()
                    adam.minimize(out)
                    adam.clear_gradients()

        N)rT   r   Zclear_gradient)rq   r   rO   rO   rP   clear_gradients  s
   
zOptimizer.clear_gradientsc                 C   sJ   t |ts	J d|r|n| j}| j||||d}| j|||d}||fS )a  
        Add operations to minimize ``loss`` by updating ``parameter_list``.

        Args:
            loss (Variable): A ``Variable`` containing the value to minimize.
            startup_program (Program, optional): :ref:`api_fluid_Program` for
                initializing parameters in ``parameter_list``. The default value
                is None, at this time :ref:`api_fluid_default_startup_program` will be used.
            parameter_list (Iterable, optional): Iterable of ``Variable`` or ``Variable.name`` to update
                to minimize ``loss``. The default value is None, at this time all parameters
                will be updated.
            no_grad_set (set, optional): Set of ``Variable``  or ``Variable.name`` that don't need
                to be updated. The default value is None.

        Returns:
            tuple: tuple (optimize_ops, params_grads), A list of operators appended
            by minimize and a list of (param, grad) variable pairs, param is
            ``Parameter``, grad is the gradient value corresponding to the parameter.
            The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to 
            indicate program pruning. If so, the program will be pruned by ``feed`` and 
            ``fetch_list`` before run, see details in ``Executor``.

        Examples:
            Please refer to the example of current Optimizer.
        The loss should be an Variable.r  rs   r  r  r  )rW   r   rT   r  r+  rq   r  r  rs   r  r  r)  rO   rO   rP   minimize  s   zOptimizer.minimize)NNNFrI   NrM   )Nr   NNNNNNNNNN)(rk   
__module____qualname____doc__r   r   ry   r   Zdygraph_onlyr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  ru   r*  r+  r  r/  r4  rO   rO   rO   rP   rH   <   s    S
1
bE
W

?		
<
6C,

E,
/K+


 rH   c                       sH   e Zd ZdZ					d fdd	Zdd Zdd	 Zed
d Z  Z	S )r5   aX  
    Optimizer of the stochastic gradient descent algorithm.

    .. math::

        param\_out = param - learning\_rate * grad

    Parameters:
        learning_rate (float|Variable): The learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element.
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies 
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.fluid as fluid
            import numpy as np

            place = fluid.CPUPlace()
            main = fluid.Program()
            with fluid.program_guard(main):
                x = fluid.layers.data(name='x', shape=[13], dtype='float32')
                y = fluid.layers.data(name='y', shape=[1], dtype='float32')
                y_predict = fluid.layers.fc(input=x, size=1, act=None)
                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
                avg_cost = fluid.layers.mean(cost)

                sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
                sgd_optimizer.minimize(avg_cost)

                fetch_list = [avg_cost]
                train_reader = paddle.batch(
                    paddle.dataset.uci_housing.train(), batch_size=1)
                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
                exe = fluid.Executor(place)
                exe.run(fluid.default_startup_program())
                for data in train_reader():
                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)

    NFc                    sB   |d usJ t t| j|||||d d| _d| _|| _i | _d S )Nrr   rs   r`   rt   rw   r   F)superr5   ry   rZ   Z_use_mkldnn_multi_precision_master_weights)rq   rr   rs   r`   rt   multi_precisionrw   rj   rO   rP   ry     s   
zSGDOptimizer.__init__c                 C      |j | jv r| j|j  }|S t| jtsJ |j d }t|}tj||j	dddd}| jj
 }|jdd|gid|gi|jtjjjd	d
 || j|j < |S NZ_fp32_masterr   r   Tr   castr
  r   in_dtype	out_dtyper  rw   r=  rW   rl   r   r   r   r   r   r   r  r   r   rf   r!   r   r   FP32rq   rx   r   r   r   rO   rO   rP   _create_master_weight1  ,   

z"SGDOptimizer._create_master_weightc                 C   st   t |tjsJ t |tr| |}|D ]#}| jr'|jtjj	j
kr'| |}q|jtjj	j
kr7| js7td qd S )NzAccumulating with FP16 in optimizer can lead to poor accuracy or slow convergence.Consider using multi_precision=True option of the Adam optimizer.)rW   r   BlockrN   Z_update_param_groupr<  rf   r!   r   r   FP16rI  r  r   rq   r   r   r   Zmaster_prO   rO   rP   r   I  s   


z!SGDOptimizer._create_accumulatorsc           
      C   s   | j o|d jtjjjk}|r| j|d j nd }| |}t	 r0t
|d ||d || d S t rEt|d ||d ||d | d S t|tjsMJ |d |d |d}d|d i}d|i}|rk||d< ||d< |j| j|||dd	}	|	S )
Nr   r   r   r   r>  MasterParamMasterParamOutTrZ   r   r   r   r   )r<  rf   r!   r   r   rL  r=  rw   r   r+   r(   Zsgd_r*   r)   r   rW   r   rK  r   rZ   )
rq   r   r   find_mastermaster_weightr   r   r   r   r   rO   rO   rP   r   Y  sJ   


z SGDOptimizer._append_optimize_op)NNNFN)
rk   r7  r8  r9  ry   rI  r   r   r   __classcell__rO   rO   r?  rP   r5     s    9r5   c                       s@   e Zd ZdZdZ					d fdd	Zdd Zd	d
 Z  ZS )r6   a  

    Simple Momentum optimizer with velocity state

    This optimizer has a flag for Nestrov Momentum.

    The update equations are as follows:

    .. math::

        & velocity = mu * velocity + gradient

        & if (use\_nesterov):

        &\quad   param = param - (gradient + mu * velocity) * learning\_rate

        & else:

        &\quad   param = param - learning\_rate * velocity

    Parameters:
        learning_rate (float|Variable): The learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element.
        momentum (float): Momentum factor
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        use_nesterov (bool, optional): Enables Nesterov momentum, default is false.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies 
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.fluid as fluid
            import numpy as np

            place = fluid.CPUPlace()
            main = fluid.Program()
            with fluid.program_guard(main):
                x = fluid.layers.data(name='x', shape=[13], dtype='float32')
                y = fluid.layers.data(name='y', shape=[1], dtype='float32')
                y_predict = fluid.layers.fc(input=x, size=1, act=None)
                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
                avg_cost = fluid.layers.mean(cost)

                moment_optimizer = fluid.optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
                moment_optimizer.minimize(avg_cost)

                fetch_list = [avg_cost]
                train_reader = paddle.batch(
                    paddle.dataset.uci_housing.train(), batch_size=1)
                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
                exe = fluid.Executor(place)
                exe.run(fluid.default_startup_program())
                for data in train_reader():
                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)

    velocityNFc                    sL   |d usJ |d usJ t t| j|||||d d| _|| _t|| _d S )Nr:  momentum)r;  r6   ry   rZ   	_momentumbool_use_nesterov)rq   rr   rU  rs   use_nesterovr`   rt   rw   r?  rO   rP   ry     s   zMomentumOptimizer.__init__c                 C   ,   t |tjsJ |D ]	}| | j| q
d S rM   )rW   r   rK  r   _velocity_acc_strrq   r   r   r   rO   rO   rP   r        z&MomentumOptimizer._create_accumulatorsc                 C   s   t |tjsJ | | j|d }| |}d }t r9t|d |d ||||d ||d| j	d| j
\}}}d S | j	| j
d}|d g|d g|g|gd}|d g|gd}	|j| j||	|dd	}
|
S )
Nr   r   murY  r^  rY  r   r   Velocityr   r   ZVelocityOutTrP  )rW   r   rK  r   r[  r   rV   r)   rU  rV  rX  r   rZ   )rq   r   r   velocity_accr   rR  r   r   r   r   momentum_oprO   rO   rP   r     s:   

z%MomentumOptimizer._append_optimize_op)NFNNN)	rk   r7  r8  r9  r[  ry   r   r   rS  rO   rO   r?  rP   r6     s    Er6   c                       s   e Zd ZdZdZdZddgddddddf fdd		Zd
d Zdd Zdd Z	d!ddZ
d"ddZdd Zdd Zd#ddZdd Zdd Zejdd  Z  ZS )$DGCMomentumOptimizera  
	:api_attr: Static Graph

    DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887

    DGC reduces the communication bandwidth by sending only the important gradients (sparse update):\
        only gradients larger than a threshold are transmitted.

    To avoid losing information, DGC accumulates the rest of the gradients locally.

    Eventually, these gradients become large enough to be transmitted.

    Thus, DGC sends the large gradients immediately but eventually sends all of the gradients over time.

    To ensure no loss of accuracy, DGC employs momentum correction and local gradient clipping on top of the gradient sparsification to maintain model performance.

    DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication.

    This optimizer will do two things:

        1. Compress the gradient by get TopK import value from tensor \
            and use it for allreduce to reduce network bandwidth.

        2. Call momentum to optimize the cost.

    Args:
        learning_rate (float|Variable): The learning rate used to update parameters. \
            It can be a float value or a Variable with one float value as a data element.
        momentum (float): Momentum factor.
        rampup_begin_step (int): The beginning step from which gradient compression is implemented.
        rampup_step (int): Time steps used in sparsity warm-up periods. Default is 1.
            For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100, \
                it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. \
                And when reach sparsity array ends, it will use 0.999 then and after.
        sparsity (list[float]): Get top important element from gradient tensor, the ratio is (1 - current sparsity). \
            Default is [0.999]. For example, if the sparsity is [0.99, 0.999], \
                the top [1%, 0.1%] important element will be transmitted.
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        use_nesterov (bool): Enables Nesterov momentum. True means use Nesterov. Default is False.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipByNorm, optional): Gradient cliping strategy. ``DGCMomentumOptimizer`` only support 
            :ref:`api_fluid_clip_GradientClipByNorm` , and if not, it will raise TypeError. Default None, 
            meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            optimizer = fluid.optimizer.DGCMomentumOptimizer(
                        learning_rate=0.0001,
                        momentum=0.9,
                        rampup_step=1000,
                        rampup_begin_step=1252,
                        sparsity=[0.999, 0.999])

    Z_dgc_u_Z_dgc_v_r   +?NFc                    s  t  rtdt sJ d|d usJ |d usJ tt| j|||	|
|d d| _|| _	t
|| _|dks<J d|| _|| _|| _d | _d | _d | _|
d ur{t|
ts[tdt|tshJ dt| |dkspJ d	|| _|
j|d
  | _| | j\| _| _d S )Nz/In dygraph, don't support DGCMomentumOptimizer.zBPaddle is not compiled with CUDA. DGC is only support GPU for now.r:  dgc_momentumr   zrampup_begin_step must >= 0zrThe type of grad_clip should be 'GradientClipByNorm', because DGCMomentumOptimizer only support GradientClipByNormz9The type of num_trainers should be 'int', but received %sz3The value of num_trainers should be greater than 0!      )r   rV   r   r!   is_compiled_with_cudar;  re  ry   rZ   rV  rW  rX  _rampup_begin_step_rampup_step	_sparsity_rampup_begin_step_var_global_step_var_dgc_clip_normrW   r   rY   r   Z_num_trainers	clip_norm_get_regularization_paramr`   regular_typeregular_coeff)rq   rr   rU  rampup_begin_steprampup_stepsparsityrs   rY  Znum_trainersr`   rt   rw   r?  rO   rP   ry   M  sX   


zDGCMomentumOptimizer.__init__c                 C   sb   d}d}|d ur-|j }ddlm}m} t||rd}||fS t||r)d}||fS J d||fS )Nr   r   r   )L1DecayL2Decayr&   Fz+regularization must be None|L1Decay|L2Deacy)Z_regularization_coeffr\   rw  rx  rW   )rq   r`   rr  rs  rw  rx  rO   rO   rP   rq    s   

z.DGCMomentumOptimizer._get_regularization_paramc                 C   sT   t tdd |j}|dk s&|jtjjjks&|jtjjjks&|jtjjj	kr(dS dS )Nc                 S      | | S rM   rO   r%  yrO   rO   rP   rQ         z2DGCMomentumOptimizer._is_use_dgc.<locals>.<lambda>i @  FT)
absr#   r   rZ   r!   r   r   r  rf   rG  )rq   	param_varr  Z	var_numelrO   rO   rP   _is_use_dgc  s   z DGCMomentumOptimizer._is_use_dgcc           	      C   s   t |tjsJ | | j|d }|d usJ |d |d || |d}|d |d}| j| jd}| |d |d s@d}nd}|	| j
| jd |	d	|d i |	d
t| ji |j||||dd}|S )Nr   r   r`  rb  r_  rU  rg  )current_stepnranksGrad_outrt  TrP  )rW   r   rK  r   _u_velocity_acc_strr   rV  rX  r  r.  rn  _nranks_varrX   rj  r   )	rq   r   r   rc  r   r   r   rZ   Zdgc_momentum_oprO   rO   rP   r     s<   z(DGCMomentumOptimizer._append_optimize_opc                 C   sz   t d}|j|ddgdd\}}|r;|j|tt|d ddd |j jdd	|gid
|gidt|idd d|_|S )Nglobal_step_counterr   r   Trw   rf   r   r   r   	force_cpur   	incrementr
  r   r   rP  )	r   create_or_get_global_variabler   r   rX   main_programr   Z_prepend_opr   )rq   counter_namebeginr   rl   counter
is_new_varrO   rO   rP   _add_auto_increment_var  s(   




z,DGCMomentumOptimizer._add_auto_increment_varrI   c                 C   sH   t d}|j|ddgdd\}}|r"|j|tt|ddd d|_|S )Nr  r   r   Tr  r  r   )r   r  r   r   rX   r   )rq   rw   r   rl   r  r  rO   rO   rP   _add_nranks_var  s   

z$DGCMomentumOptimizer._add_nranks_varc                 C   s  t  }d|_| jtj dd| _| jtj dd| _	t
jdgtjjjdtj | jd dd| _t| jj| _|D ]\}}| | j|}| ||sMq;| | j|}t
jdg|jd|jtj  d	dd}t
jdg|jd|jtj  d	d
d}t
jdg|jd|jtj  d	d
d}	tj}
tjj j!}|" j#D ]8}| $|sq|% |
&  }|j|vrq|'|j |'|j t(|dkr|)|
& | q|*|
&  q|}| j+d ur| ,|| j+}| -||||||||	 q;d S )NTr   )r  r  rI   )rw   r   r   r   )r   rf   r   rw   r   r  r   F).r	   Z_enable_dgcr  r!   dgcZkDGCCounterNamern  r  ZkDGCNRanksNamer  r"   r   r   r   rG  ZkDGCRampUpBeginStepNamerj  rm  r   rj   rk   rl   r   r  r  _v_velocity_acc_strrf   rw   Z	kDGCKNameZkDGCEncodedNameZkDGCGatherNamer   OpRoleBackwardr   r   _is_the_backward_op	all_attrskOpRoleVarAttrNameremover   	_set_attr_remove_attrro  _append_clip_norm_dgc_op)rq   Zparam_and_gradsr  r~  r  u_varv_vark_varencoded_var
gather_varop_makerr  r   var_attrclip_varrO   rO   rP   _append_dgc_ops  s   
	



z$DGCMomentumOptimizer._append_dgc_opsc                 C   B   t j}t jjj}| |jv rt| |  t|krdS dS NTF	r!   r   r  r  r  
attr_namesr   r  kOpRoleAttrNamerq   r   r  r  rO   rO   rP   r  1     
z(DGCMomentumOptimizer._is_the_backward_opc                 C   s|   |||d}t di |}|d u rtd|jdg}|j|j||jdd}|jd|| j	d|t
| jd	d
|id |S )Nr%  max_normrw   dgc_clip_by_norm_op.tmpF)rZ   rw   rf   r   Zdgc_clip_by_norm)r
  r  )r  rt  r   )rZ   r   r   r   )r  )r   r   generate_with_ignorable_keyjoinrw   Zcreate_variablerZ   rf   r   rn  rX   rj  )rq   r%  r  rw   argsrl   r}   rO   rO   rP   _clip_by_norm9  s*   
z"DGCMomentumOptimizer._clip_by_normc                 C   sB   |j j  | j|||jdW  d    S 1 sw   Y  d S )Nr  )r   r   Z_backward_role_guardr  rw   )rq   r  rp  rO   rO   rP   r  S  s   $z&DGCMomentumOptimizer._append_clip_normc	                 C   s   t   }	tj}
| j}| j}|jd ur| |j\}}|	j	d||||| j
| jd||||||d| j| j| jt| jt| jt|t|ddd}|
jj}||
 | ||
 |j|jg d S )Nr  )UVr   r   r  r  )ZU_outZV_outZ
EncodeGradr   r  Z
GatherBuff)mrv  rY  rt  ru  rs  rr  TrP  )r   r	   r   r!   r   rr  rs  r\   rq  r   rn  r  rV  rl  rX  rX   rj  rk  r   r  r  r  r  r  rw   )rq   r~  r  r  r  r  r  r  r  r   r  rr  rs  Zdgc_opr  rO   rO   rP   r  Y  sL   
	
#

zDGCMomentumOptimizer._dgc_opc           	      C   s   |  | t|dd d}| |\}}}g }g }|D ]\}}| ||s-|||f q|||f q| jd ur@| |}nt|}| || j}|| }t|dd d}| 	|}|d urj|| || |S )Nc                 S   r!  r"  r#  r$  rO   rO   rP   rQ     r&  z6DGCMomentumOptimizer.apply_gradients.<locals>.<lambda>r'  c                 S   r!  r"  r#  r$  rO   rO   rP   rQ     r&  )
r  r(  r   r  r   ra   r   r  r`   r   )	rq   r  Ztable_param_and_gradZtable_optimize_opZnot_dgc_params_gradsZdgc_params_gradsrx   r  r)  rO   rO   rP   r*    s2   




z$DGCMomentumOptimizer.apply_gradientsr   )rI   rM   )rk   r7  r8  r9  r  r  ry   rq  r  r   r  r  r  r  r  r  r  r   r   r*  rS  rO   rO   r?  rP   re  	  s2    @8	
%
O
4re  c                       sZ   e Zd ZdZdZ										d fd	d
	Zdd Zdd Zdd Zdd Z	  Z
S )rB   ab  
    Momentum optimizer with LARS support

    The update equations are as follows:

    .. math::

        & local\_learning\_rate = learning\_rate * lars\_coeff * \\
          \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||}

        & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param + epsilon)

        & param = param - velocity

    Parameters:
        learning_rate (float|Variable): The learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element. \
            momentum (float): momentum factor
        lars_coeff (float): Defines how much we trust the layer to change its weights.
        lars_weight_decay (float): Weight decay coefficient for decaying using LARS.
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies 
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.
        exclude_from_weight_decay (list[str], optional): Name string of layers which will be exclude from lars weight decay. Default is None.
        epsilon (float, optional): Epsilon to avoid Division by Zero when calculate local lr. Default is 0.
        multi_precision (bool, optional): Whether to use multi-precision during weight updating.
        rescale_grad (float, optional): Multiply the gradient with `rescale_grad` \
            before updating. Often choose to be `1.0/batch_size`.
        
    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            import numpy as np

            np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
            inp = fluid.layers.data(
                name="inp", shape=[2, 2], append_batch_size=False)
            out = fluid.layers.fc(inp, size=3)
            out = fluid.layers.reduce_sum(out)
            optimizer = fluid.optimizer.LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9)
            optimizer.minimize(out)

            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(fluid.default_startup_program())
            exe.run(
                feed={"inp": np_inp},
                fetch_list=[out.name])
    rT  MbP?Mb@?Nr   Fr   c                    s   |d usJ |d usJ t t| j|||||d d| _|| _t|| _t|| _t|
| _|	d u r6g | _	n|	| _	|| _
t|| _i | _d S )Nr:  lars_momentum)r;  rB   ry   rZ   rV  rX   _lars_coeff_lars_weight_decay_epsilon_exclude_from_weight_decayr<  _rescale_gradr=  )rq   rr   rU  
lars_coefflars_weight_decayrs   r`   rt   rw   Zexclude_from_weight_decayepsilonr>  rescale_gradr?  rO   rP   ry     s,   




zLarsMomentumOptimizer.__init__c                 C   r@  rA  rF  rH  rO   rO   rP   rI    rJ  z+LarsMomentumOptimizer._create_master_weightc                 C   s~   | j dur| j d | }| jo|jtjjjk}|r| j|j n|}|j}|| j	vs0|| j	| vr8t
d||| j	| | S )a
  Utility function to fetch an accumulator for a parameter
        Args:
            name: name of the accumulator
            param: parameter variable for which accumulator is to be fetched
        Returns:
            accumulator variable for the parameter
        Nr   r   )rU   r<  rf   r!   r   r   rL  r=  rw   rh   r   r   )rq   rw   rx   rQ  Ztarget_paramtarget_namerO   rO   rP   r   /  s$   

z&LarsMomentumOptimizer._get_accumulatorc                 C   s|   t |tjsJ |D ]1}| jr$|jtjjjkr$| 	|}| 
| j| q
|jtjjjkr4| js4td | 
| j| q
d S )NzAccumulating with FP16 in optimizer can lead to poor accuracy or slow convergence.Consider using multi_precision=True option of the Lars optimizer.)rW   r   rK  r<  rf   r!   r   r   rL  rI  r   r[  r  r   rM  rO   rO   rP   r   D  s   
z*LarsMomentumOptimizer._create_accumulatorsc                 C   sd  t |tjsJ | j}|d j}t| jdkr%| jD ]
}||v r$d} nq| | j|d }| 	|}| j
o?|d jtjjjk}|rJ| j|d j nd }	| j| j|g|| j| jd}
|d |d ||d}|d |d}|rv|	|d< |	|d< t rt|d g|d g|g|g|d g|gd	| jd
| jd|gd|d| jd| j\}}d S |j| j|||
dd}|S )Nr   r   )r^  r  r  r>  r  r  r   r`  rb  rN  rO  r^  r  r  r>  r  r  TrP  )rW   r   rK  r  rw   r   r  r   r[  r   r<  rf   r!   r   r   rL  r=  rV  r  r  r  rV   r)   r  r   rZ   )rq   r   r   r  r   rw   rc  r   rQ  rR  r   r   r   r  Ztmp2rd  rO   rO   rP   r   S  sj   






z)LarsMomentumOptimizer._append_optimize_op)
r  r  NNNNNr   Fr   )rk   r7  r8  r9  r[  ry   rI  r   r   r   rS  rO   rO   r?  rP   rB     s$    <"rB   c                       sB   e Zd ZdZdZ						d fdd	Zdd	 Zd
d Z  ZS )r7   a  
    The Adaptive Gradient optimizer (Adagrad for short) can adaptively assign
    different learning rates to individual parameters.

    The parameter ``param_out`` update rule with gradient ``grad``:

    .. math::

        moment\_out &= moment + grad * grad

        param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}

    Related paper: `Adaptive Subgradient Methods for Online Learning and
    Stochastic Optimization <http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_.

    The original paper does not have the ``epsilon`` attribute. It is added here
    in our implementation as also proposed `Per-parameter adaptive learning rate
    methods <http://cs231n.github.io/neural-networks-3/#ada>`_
    for numerical stability to avoid the division by zero error.

    Args:
        learning_rate (float|Variable): The learning rate used to update ``Parameter``.
            It can be a float value or a ``Variable`` with a float type.
        epsilon (float, optional): A small float value for numerical stability.
            The default value is 1e-06.
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies 
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.
        initial_accumulator_value (float, optional): Initial value for moment accumulator.
            The default value is 0.0.

    Examples:
        .. code-block:: python

            import numpy as np
            import paddle.fluid as fluid

            np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
            inp = fluid.data(name="inp", shape=[2, 2])
            out = fluid.layers.fc(inp, size=3)
            out = fluid.layers.reduce_sum(out)
            optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.2)
            optimizer.minimize(out)

            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(fluid.default_startup_program())
            exe.run(
                feed={"inp": np_inp},
                fetch_list=[out.name])
    momentư>Nr   c                    sH   |d usJ |d usJ t t| j|||||d d| _|| _|| _d S )Nr:  adagrad)r;  r7   ry   rZ   r  initial_accumulator_value)rq   rr   r  rs   r`   rt   rw   r  r?  rO   rP   ry     s   
zAdagradOptimizer.__init__c                 C   s2   t |tjsJ |D ]}| j| j|| jd q
d S )N)r   )rW   r   rK  r   _moment_acc_strr  r\  rO   rO   rP   r     s   z%AdagradOptimizer._create_accumulatorsc              
   C   s   t |tjsJ | | j|d }t r't|d |d || || j	 d S t
 rBt|d |d || ||d |d| j	 d S |j| j|d |d || |d|d |dd| j	idd}|S )Nr   r   r  r   r   Momentr   r   	MomentOutTrP  )rW   r   rK  r   r  r+   r(   Zadagrad_r   r  r*   r)   r  r   rZ   )rq   r   r   
moment_accZ
adagrad_oprO   rO   rP   r     s@   
z$AdagradOptimizer._append_optimize_op)r  NNNNr   	rk   r7  r8  r9  r  ry   r   r   rS  rO   rO   r?  rP   r7     s    >r7   c                       sb   e Zd ZdZdZdZdZdZ						
	
	
	
				d fdd	Zdd Z	dd Z
dd Z  ZS )r8   a!  
    The Adam optimizer uses an optimization described at the end
    of section 2 of `Adam paper <https://arxiv.org/abs/1412.6980>`_ ,
    it can dynamically adjusts the learning rate of each parameter using
    the 1st moment estimates and the 2nd moment estimates of the gradient.
    
    The parameter ``param_out`` update rule with gradient ``grad``:

    .. math::

        t & = t + 1

        moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad

        moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad

        learning\_rate & = learning\_rate * \\
                          \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}

        param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon}

    Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_

    Args:
        learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``.
            It can be a float value or a ``Variable`` with a float type. The default value is 0.001.
        beta1 (float|Variable, optional): The exponential decay rate for the 1st moment estimates.
            It should be a float number or a Variable with shape [1] and data type as float32.
            The default value is 0.9.
        beta2 (float|Variable, optional): The exponential decay rate for the 2nd moment estimates.
            It should be a float number or a Variable with shape [1] and data type as float32.
            The default value is 0.999.
        epsilon (float|Tensor, optional): A small float value for numerical stability.
            It should be a float number or a Variable with shape [1] and data type as float32.
            The default value is 1e-08.
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies 
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.
        lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
            The accumulators are updated at every step. Every element of the two moving-average
            is updated in both dense mode and sparse mode. If the size of parameter is very large,
            then the update may be very slow. The lazy mode only update the element that has
            gradient in current mini-batch, so it will be much more faster. But this mode has
            different semantics with the original Adam algorithm and may lead to different result.
            The default value is False.
        use_global_beta_pow (bool, optional): Whether to use global beta_pow. If true, Adam will use global beta_pow 
            for whole model instead of creating beta_pow for each parameter. Default is false.
        flatten_param_grads (bool, optional): Whether to flatten all parameters and gradients. Default is false.
        align_size (int, optional): The alignment size when flatten parameters and gradients. Default is -1, which means
            use same align_size as allocator. 

    Examples:
        .. code-block:: python

            import paddle
            import paddle.fluid as fluid

            place = fluid.CPUPlace()
            main = fluid.Program()
            with fluid.program_guard(main):
                x = fluid.data(name='x', shape=[None, 13], dtype='float32')
                y = fluid.data(name='y', shape=[None, 1], dtype='float32')
                y_predict = fluid.layers.fc(input=x, size=1, act=None)
                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
                avg_cost = fluid.layers.mean(cost)

                adam_optimizer = fluid.optimizer.AdamOptimizer(0.01)
                adam_optimizer.minimize(avg_cost)

                fetch_list = [avg_cost]
                train_reader = paddle.batch(
                    paddle.dataset.uci_housing.train(), batch_size=1)
                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
                exe = fluid.Executor(place)
                exe.run(fluid.default_startup_program())
                for data in train_reader():
                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)

        .. code-block:: python

            # Adam with beta1/beta2 as Variable
            import paddle
            import paddle.fluid as fluid
            import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler

            place = fluid.CPUPlace()
            main = fluid.Program()
            with fluid.program_guard(main):
                x = fluid.data(name='x', shape=[None, 13], dtype='float32')
                y = fluid.data(name='y', shape=[None, 1], dtype='float32')
                y_predict = fluid.layers.fc(input=x, size=1, act=None)
                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
                avg_cost = fluid.layers.mean(cost)

                # define beta decay variable
                def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate, epsilon_init):
                    global_step = lr_scheduler._decay_step_counter()

                    beta1 = fluid.layers.create_global_var(
                        shape=[1],
                        value=float(beta1_init),
                        dtype='float32',
                        # set persistable for save checkpoints and resume
                        persistable=True,
                        name="beta1")
                    beta2 = fluid.layers.create_global_var(
                        shape=[1],
                        value=float(beta2_init),
                        dtype='float32',
                        # set persistable for save checkpoints and resume
                        persistable=True,
                        name="beta2")
                    epsilon = fluid.layers.create_global_var(
                        shape=[1],
                        value=float(epsilon_init),
                        dtype='float32',
                        # set persistable for save checkpoints and resume
                        persistable=True,
                        name="epsilon")

                    div_res = global_step / decay_steps
                    decayed_beta1 = beta1_init * (decay_rate**div_res)
                    decayed_beta2 = beta2_init * (decay_rate**div_res)
                    fluid.layers.assign(decayed_beta1, beta1)
                    fluid.layers.assign(decayed_beta2, beta2)

                    return beta1, beta2, epsilon

                beta1, beta2, epsilon = get_decayed_betas(0.9, 0.99, 1e5, 0.9, 1e-8)
                adam_optimizer = fluid.optimizer.AdamOptimizer(
                                                    learning_rate=0.01,
                                                    beta1=beta1,
                                                    beta2=beta2,
                                                    epsilon=epsilon)
                adam_optimizer.minimize(avg_cost)

                fetch_list = [avg_cost]
                train_reader = paddle.batch(
                    paddle.dataset.uci_housing.train(), batch_size=1)
                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
                exe = fluid.Executor(place)
                exe.run(fluid.default_startup_program())
                for data in train_reader():
                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
    moment1moment2beta1_pow_accbeta2_pow_accr  ?rf  :0yE>NFrI   c              	      sv   |d usJ |d usJ |d usJ |d usJ t t| j|||||||d d| _|| _|| _|| _|	| _|
| _d S )N)rr   rs   r`   rt   ru   rv   rw   adam)	r;  r8   ry   rZ   _beta1_beta2r  
_lazy_mode_use_global_beta_pow)rq   rr   beta1beta2r  rs   r`   rt   rw   	lazy_modeuse_global_beta_powru   rv   r?  rO   rP   ry   	  s*   
zAdamOptimizer.__init__c              	   C   s  t |tjsJ |D ]E}| | j| | | j| | jsO| j| j|t | jt	r*dn| jdgt
jjjdd | j| j|t | jt	rCdn| jdgt
jjjdd q
| jr| j| jt | jt	r_dn| jdgt
jjjdd | j| jt | jt	rwdn| jdgt
jjjdd d S d S )Nr  r   r   )rw   rx   r   r   rZ   r   rf  )rw   r   r   rZ   r   )rW   r   rK  r   _moment1_acc_str_moment2_acc_strr  _beta1_pow_acc_strr  r   r!   r   r   r   _beta2_pow_acc_strr  r   r\  rO   rO   rP   r   	  sL   




z"AdamOptimizer._create_accumulatorsc                 C   s  t |tjsJ | | j|d }| | j|d }| jr*| | j}| | j	}n| | j|d }| | j	|d }| 
|}t rt | jtsN| jn| j d}t | jts_| jn| j d}	d }
t|d |d ||||||
|d |||||
d| jd| jddd|d|	d	| j\}}}}}}d S |d g|d g|g|g|g|g|gd
}| d}|r||d< |d g|g|g|g|gd}| jd| jd}t | jtr| j|d< n| j|d< t | jtr| j|d< n| j|d< t | jtr| j|d< n| j|d< |j| j|||dd}|S )Nr   r   r  r  min_row_size_to_use_multithread  r  r  r  r   r   r   ZMoment1ZMoment2Beta1PowZBeta2Pow	found_infZ
SkipUpdater   Z
Moment1OutZ
Moment2OutZBeta1PowOutZBeta2PowOut)r  r  r  ZBeta1TensorZBeta2TensorZEpsilonTensorTrP  )rW   r   rK  r   r  r  r  r   r  r  r   rV   r  r   r   itemr  r)   r  r  r  r   r   rZ   )rq   r   r   r  r  r  r  r   r  r  rR  r   r   r  r   r   Zadam_oprO   rO   rP   r   	  s   



	



z!AdamOptimizer._append_optimize_opc                 C   s0  t |tjsJ | jr| | j}| | j}|jg o d|i}d|i}i }t | j	t
r>| j	|d< |jd|||dd n| j	|d< |jd|||dd d|i}d|i}i }t | jt
rm| j|d< |jd|||dd n| j|d< |jd|||dd W d   dS W d   dS 1 sw   Y  dS dS )	z3Update beta1_pow and beta2_pow accumulator
        r
  r   Yelementwise_mulTrP  scaleN)rW   r   rK  r  r   r  r  r   r   r  r   r   r  )rq   r   r   r  r  r   r   r   rO   rO   rP   r   M
  sb   



"zAdamOptimizer._finish_update)r  r  rf  r  NNNNFFFrI   )rk   r7  r8  r9  r  r  r  r  ry   r   r   r   rS  rO   rO   r?  rP   r8   	  s.      $Wr8   c                       sV   e Zd ZdZdZdZdZ												d fd
d	Zdd Zdd Z	dd Z
  ZS )r9   a  
    The Adamax optimizer is implemented based on the Adamax Optimization 
    in Section 7 of `Adam paper <https://arxiv.org/abs/1412.6980>`_.
    The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm,
    which makes the learning rate update algorithm more stable and simple.

    The parameter ``param_out`` update rule with gradient ``grad``:

    .. math::

        t & = t + 1

        moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad

        inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|)

        learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t}

        param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out}

    Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_

    The original paper does not have an ``epsilon`` attribute,
    it is added here for numerical stability to prevent the division by 0 error.

    Args:
        learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``.
            It can be a float value or a ``Variable`` with a float type. The default value is 0.001.
        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
            The default value is 0.9.
        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
            The default value is 0.999.
        epsilon (float, optional): A small float value for numerical stability.
            The default value is 1e-08.
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies 
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.

    **Notes**:
        **Currently, AdamaxOptimizer doesn't support sparse parameter optimization.**

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          import numpy

          # First create the Executor.
          place = fluid.CPUPlace() # fluid.CUDAPlace(0)
          exe = fluid.Executor(place)

          train_program = fluid.Program()
          startup_program = fluid.Program()
          with fluid.program_guard(train_program, startup_program):
              data = fluid.data(name='X', shape=[None, 1], dtype='float32')
              hidden = fluid.layers.fc(input=data, size=10)
              loss = fluid.layers.mean(hidden)
              adam = fluid.optimizer.AdamaxOptimizer(learning_rate=0.2)
              adam.minimize(loss)

          # Run the startup program once and only once.
          exe.run(startup_program)

          x = numpy.random.random(size=(10, 1)).astype('float32')
          outs = exe.run(program=train_program,
                        feed={'X': x},
                         fetch_list=[loss.name])
    r  inf_normr  r  r  rf  r  Nc	           	         sf   |d usJ |d usJ |d usJ |d usJ t t| j|||||d d| _|| _|| _|| _d S )Nr:  adamax)r;  r9   ry   rZ   r  r  r  )	rq   rr   r  r  r  rs   r`   rt   rw   r?  rO   rP   ry   
  s   	
zAdamaxOptimizer.__init__c                 C   sB   |D ]}|  | j| |  | j| | j | j|| jdgd qd S )Nr   )rw   rx   r   r   )r   r  _inf_norm_acc_strr  r  r\  rO   rO   rP   r   
  s   z$AdamaxOptimizer._create_accumulatorsc                 C   s  t |tjsJ | | j|d }| | j|d }| | j|d }t r@t	|d |d | 
||||| j| j| j	 d S t ret|d |d | 
|||||d ||d| jd| jd| j d S |j| j|d |d | 
||||d|d ||d| j| j| jdd	d
}|S )Nr   r   r  r  r  )r   r   r   r  ZInfNormr  )r   r  Z
InfNormOut)r  r  r  TrP  )rW   r   rK  r   r  r  r  r+   r(   Zadamax_r   r  r  r  r*   r)   r  r   rZ   )rq   r   r   r  r  r  Z	adamax_oprO   rO   rP   r   
  sR   



	z#AdamaxOptimizer._append_optimize_opc              
   C   s  t |tjsJ |D ]t\}}|du s|jdu rq
|jj||gU tdA | | j	|}t
 rOt r@t|| jdd}nt|d| j}||d n|jdd|id|id| jidd	 W d   n1 sjw   Y  W d   n1 syw   Y  q
dS )
z'Update Beta1 Power accumulator
        NFZadamxr   Tr  r
  r   rP  )rW   r   rK  r   r   r   r   r   r   r  rV   r+   r(   r  r  r)   Zcopy_r   )rq   r   r   rx   r  r  r  rO   rO   rP   r   #  s>    zAdamaxOptimizer._finish_update)r  r  rf  r  NNNN)rk   r7  r8  r9  r  r  r  ry   r   r   r   rS  rO   rO   r?  rP   r9   
  s"    P
.r9   c                       s4   e Zd ZdZ					d fdd	Zd	d
 Z  ZS )r:   a  
    We implement the Dpsgd optimizer according to CCS16 paper -
    Deep Learning with Differential Privacy.

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          import numpy

          # First create the Executor.
          place = fluid.CPUPlace() # fluid.CUDAPlace(0)
          exe = fluid.Executor(place)

          train_program = fluid.Program()
          startup_program = fluid.Program()
          with fluid.program_guard(train_program, startup_program):
              data = fluid.layers.data(name='X', shape=[1], dtype='float32')
              hidden = fluid.layers.fc(input=data, size=10)
              loss = fluid.layers.mean(hidden)
              optimizer = fluid.optimizer.Dpsgd(learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0)
              optimizer.minimize(loss)

          # Run the startup program once and only once.
          exe.run(startup_program)

          x = numpy.random.random(size=(10, 1)).astype('float32')
          outs = exe.run(program=train_program,
                        feed={'X': x},
                         fetch_list=[loss.name])

    Args:
        learning_rate (float|Variable): the learning rate used to update parameters. \
        Can be a float value or a Variable with one float value as data element.
        clip (float): clipping threshold
        batch_size (float): batch size.
        sigma (float): for gaussian noise.
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
    Notes:
       Currently, DpsgdOptimizer doesn't support sparse parameter optimization.
    r  r  rf  r  Nc                    sh   |d usJ |d usJ |d usJ |d usJ t t| j||d d| _|| _|| _|| _	 d | _d S )N)rr   rs   dpsgd)r;  r:   ry   rZ   _clip_batch_size_sigma_seed)rq   rr   clip
batch_sizesigmars   r?  rO   rP   ry   k  s   
zDpsgdOptimizer.__init__c                 C   s   t |tjsJ | jd krd| _t r3t|d |d | ||d d| jd| j	d| j
d| j d S |j| j|d |d | |dd|d i| j| j	| j
| jd	d
d}|S )Nr   r   r  r  r  seedr   r   )r  r  r  r  TrP  )rW   r   rK  r  rV   r)   r  r   r  r  r  r   rZ   )rq   r   r   Zdpsgd_oprO   rO   rP   r     s0   


z"DpsgdOptimizer._append_optimize_op)r  r  rf  r  N)rk   r7  r8  r9  ry   r   rS  rO   rO   r?  rP   r:   >  s    -r:   c                       sB   e Zd ZdZdZ						d fdd	Zdd	 Zd
d Z  ZS )r;   a  
    The Decayed Adagrad optimizer can be seen as an Adagrad algorithm that introduces
    the decay rate to solve the problem of a sharp drop in the learning rate
    during model training when using the AdagradOptimizer.

    The parameter ``param_out`` update rule with gradient ``grad``:

    .. math::

        moment\_out & = decay * moment + (1 - decay) * grad * grad

        param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}

    Related paper: `Adaptive Subgradient Methods for Online Learning and Stochastic
    Optimization <http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_.

    The original paper does not have an ``epsilon`` attribute. It is added here for numerical
    stability to avoid the division by zero error.

    Args:
        learning_rate (float|Variable): The learning rate used to update ``Parameter``.
            It can be a float value or a ``Variable`` with a float type.
        decay (float, optional): The decay rate. The default value is 0.95.
        epsilon (float, optional): A small float value for numerical stability.
            The default value is 1e-06.
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies 
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.

    **Notes**:
        **Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.**

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid

            x = fluid.data( name='x', shape=[None, 10], dtype='float32' )
            trans = fluid.layers.fc( x, 100 )
            cost = fluid.layers.reduce_mean( trans )
            optimizer = fluid.optimizer.DecayedAdagradOptimizer(learning_rate=0.2)
            optimizer.minimize(cost)
    r  ffffff?r  Nc                    sT   |d usJ |d usJ |d usJ t t| j|||||d d| _|| _|| _d S )Nr:  decayed_adagrad)r;  r;   ry   rZ   _decayr  )rq   rr   decayr  rs   r`   rt   rw   r?  rO   rP   ry     s   
z DecayedAdagradOptimizer.__init__c                 C   rZ  rM   )rW   r   rK  r   r  r\  rO   rO   rP   r     r]  z,DecayedAdagradOptimizer._create_accumulatorsc                 C   s   t |tjsJ | | j|d }t r0t|d |d || ||d |d| j	d| j

 d S |j| j|d |d || |d|d |d| j	| j
ddd	}|S )
Nr   r   r  r  r  r  )r  r  TrP  )rW   r   rK  r   r  rV   r)   r  r   r  r  r   rZ   )rq   r   r   r  Zdecayed_adagrad_oprO   rO   rP   r     s8   z+DecayedAdagradOptimizer._append_optimize_op)r  r  NNNNr  rO   rO   r?  rP   r;     s    7r;   c                       sF   e Zd ZdZdZdZ						d fdd	Zd	d
 Zdd Z  Z	S )r?   a  
    **Notes: This API does not support sparse parameter optimization.**

    Adadelta Optimizer. Please refer to this for details:
    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD <https://arxiv.org/abs/1212.5701>`_.

    The update is done as follows:

    .. math::

        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2

        learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \\epsilon ) / ( E(g_t^2) + \\epsilon ) }

        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2

    Args:
        learning_rate (float|Variable): global learning rate.
        epsilon (float): a small float number for numeric stability. Default 1.0e-6.
        rho (float): a floating point value indicating the decay rate. Default 0.95.
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies 
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): The default value is None. Normally there is no need for user
                to set this property. For more information, please refer to
                :ref:`api_guide_Name` .

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid

            image = fluid.data(name='image', shape=[None, 28], dtype='float32')
            fc = fluid.layers.fc(image, size=10)
            cost = fluid.layers.reduce_mean(fc)
            optimizer = fluid.optimizer.Adadelta(
                learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)

            # optimizer_ops is a list of optimizer operators to update parameters
            # params_grads is a list of (param, param_grad), where param is each
            # parameter and param_grad is the gradient variable of param.
            optimizer_ops, params_grads = optimizer.minimize(cost)
    Z_avg_squared_gradZ_avg_squared_updater  r  Nc                    s`   |d u rt d|d u rt d|d u rt dtt| j|||||d d| _|| _|| _d S )Nlearning_rate is not set.epsilon is not set.rho is not set.r:  adadelta)
ValueErrorr;  r?   ry   rZ   r  _rho)rq   rr   r  rhors   r`   rt   rw   r?  rO   rP   ry   Y  s   
zAdadeltaOptimizer.__init__c                 C   >   t |tjs
td|D ]}| | j| | | j| qd S N)block is not instance of framework.Block.)rW   r   rK  rY   r   _avg_squared_grad_acc_str_avg_squared_update_acc_strr\  rO   rO   rP   r   p     z&AdadeltaOptimizer._create_accumulatorsc                 C   s   t |tjs
td| | j|d }| | j|d }t r2t	|d |d ||| j
| j d S t rOt|d |d |||d ||d| jd| j
 d S |j| j|d |d ||d|d ||d| j| j
dd	d
}|S )Nr  r   r   r  r  )r   r   ZAvgSquaredGradZAvgSquaredUpdate)r   ZAvgSquaredGradOutZAvgSquaredUpdateOut)r  r  TrP  )rW   r   rK  rY   r   r	  r
  r+   r(   Z	adadelta_r  r  r*   r)   r  r   rZ   )rq   r   r   Zavg_squared_grad_accZavg_squared_update_accZadadelta_oprO   rO   rP   r   x  sH   


	z%AdadeltaOptimizer._append_optimize_op)r  r  NNNN)
rk   r7  r8  r9  r	  r
  ry   r   r   rS  rO   rO   r?  rP   r?      s    5r?   c                       sN   e Zd ZdZdZdZdZ												d fd
d	Zdd Zdd Z	  Z
S )r<   a  
    Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
    rate method. The original slides proposed RMSProp: Slide 29 of
    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .

    The original equation is as follows:

    ..  math::

        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2

        w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)

    The first equation calculates moving average of the squared gradient for
    each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`.

    In some cases, adding a momentum term :math: `\\beta` is beneficial.
    In our implementation, Nesterov momentum is used:

    ..  math::

        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2

        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
            \\epsilon}} \\nabla Q_{i}(w)

        w & = w - v(w, t)

    if centered is True:

    ..  math::

        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2

        g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)

        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
            \\epsilon}} \\nabla Q_{i}(w)

        w & = w - v(w, t)

    where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95
    and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a
    smoothing term to avoid division by zero, usually set somewhere in range
    from 1e-4 to 1e-8.


    Parameters:
        learning_rate(float): Global learning rate.
        rho(float): rho is :math: `\\rho` in equation, default is 0.95.
        epsilon(float): :math: `\\epsilon` in equation is smoothing term to
            avoid division by zero, default is 1e-6.
        momentum(float): :math:`\\beta` in equation is the momentum term,
            default is 0.0.
        centered(bool): If True, gradients are normalized by the estimated variance of
            the gradient; if False, by the uncentered second moment. Setting this to
            True may help with training, but is slightly more expensive in terms of
            computation and memory. Defaults to False.
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies 
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.

    Raises:
        ValueError: If learning_rate, rho, epsilon, momentum are None.

    Examples:
          .. code-block:: python

            import paddle
            import paddle.fluid as fluid
            import numpy as np

            place = fluid.CPUPlace()
            main = fluid.Program()
            with fluid.program_guard(main):
                x = fluid.layers.data(name='x', shape=[13], dtype='float32')
                y = fluid.layers.data(name='y', shape=[1], dtype='float32')
                y_predict = fluid.layers.fc(input=x, size=1, act=None)
                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
                avg_cost = fluid.layers.mean(cost)

                rms_optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
                rms_optimizer.minimize(avg_cost)

                fetch_list = [avg_cost]
                train_reader = paddle.batch(
                    paddle.dataset.uci_housing.train(), batch_size=1)
                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
                exe = fluid.Executor(place)
                exe.run(fluid.default_startup_program())
                for data in train_reader():
                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)

    rU  Zmean_squareZ	mean_gradr  r  r   FNc
           
         s|   t t| j|||||	d |d u rtd|d u rtd|d u r%td|d u r-tdd| _|| _|| _|| _|| _d S )Nr:  r  r  r   zmomentum is not set.rmsprop)	r;  r<   ry   r  rZ   r  r  rV  	_centered)
rq   rr   r  r  rU  centeredrs   r`   rt   rw   r?  rO   rP   ry     s&   

zRMSPropOptimizer.__init__c                 C   sL   t |tjs
td|D ]}| | j| | | j| | | j| qd S r  )rW   r   rK  rY   r   _momentum_acc_str_mean_square_acc_str_mean_grad_acc_strr\  rO   rO   rP   r   6  s   z%RMSPropOptimizer._create_accumulatorsc                 C   s*  t |tjs
td| | j|d }| | j|d }| | j|d }t rCt	
|d ||d || ||| j| j| j| j
 d S t rjt|d || ||d ||d |||d| jd| jd| jd| j d S |j| j|d |d |||| |d|d |||d	| j| j| j| jd
dd}|S )Nr  r   r   r  r  rU  r  )r   r   r  Z
MeanSquareZMeanGradr   )r   r  ZMeanSquareOutZMeanGradOut)r  r  rU  r  TrP  )rW   r   rK  rY   r   r  r  r  r+   r(   Zrmsprop_r   r  r  rV  r  r*   r)   r  r   rZ   )rq   r   r   Zmomentum_accZmean_square_accZmean_grad_accZ
rmsprop_oprO   rO   rP   r   ?  sf   

	z$RMSPropOptimizer._append_optimize_op)r  r  r   FNNNN)rk   r7  r8  r9  r  r  r  ry   r   r   rS  rO   rO   r?  rP   r<     s     j	r<   c                       sH   e Zd ZdZdZdZ							d fdd	Zd	d
 Zdd Z  Z	S )r=   a  
    FTRL (Follow The Regularized Leader) Optimizer.

    The paper that proposed Follow The Regularized Leader (FTRL):
    (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)

    ..  math::

        &new\_accum = squared\_accum + grad^2

        &if (lr\_power == -0.5):

        &\quad  linear\_accum += grad - \\frac{\\sqrt{new\_accum} - \\sqrt{squared\_accum}}{learning\_rate * param}

        &else:

        &\quad   linear\_accum += grad - \\frac{new\_accum^{-lr\_power} - accum^{-lr\_power}}{learning\_rate * param}


        &x = l1 * sign(linear\_accum) - linear\_accum

        &if (lr\_power == -0.5):

        &\quad   y = \\frac{\\sqrt{new\_accum}}{learning\_rate} + (2 * l2)

        &\quad   pre\_shrink = \\frac{x}{y}

        &\quad   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)

        &else:

        &\quad   y = \\frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2)

        &\quad   pre\_shrink = \\frac{x}{y}

        &\quad   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)

        &squared\_accum += grad^2

    Parameters:
        learning_rate (float|Variable): Global learning rate.
        l1 (float): L1 regularization strength, default is 0.0.
        l2 (float): L2 regularization strength, default is 0.0.
        lr_power (float): Learning Rate Power, default is -0.5.
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies 
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.

    Raises:
        ValueError: If learning_rate, rho, epsilon, momentum are None.

    Examples:
          .. code-block:: python

            import paddle
            import paddle.fluid as fluid
            import numpy as np

            place = fluid.CPUPlace()
            main = fluid.Program()
            with fluid.program_guard(main):
                x = fluid.layers.data(name='x', shape=[13], dtype='float32')
                y = fluid.layers.data(name='y', shape=[1], dtype='float32')
                y_predict = fluid.layers.fc(input=x, size=1, act=None)
                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
                avg_cost = fluid.layers.mean(cost)

                ftrl_optimizer = fluid.optimizer.Ftrl(learning_rate=0.1)
                ftrl_optimizer.minimize(avg_cost)

                fetch_list = [avg_cost]
                train_reader = paddle.batch(
                    paddle.dataset.uci_housing.train(), batch_size=1)
                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
                exe = fluid.Executor(place)
                exe.run(fluid.default_startup_program())
                for data in train_reader():
                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)

    NOTE:
       Currently, FtrlOptimizer doesn't support sparse parameter optimization.
    ZsquaredZlinearr   rh  Nc	           	         sF   t t| j|||||d |d u rtdd| _|| _|| _|| _d S )Nr:  r  ftrl)r;  r=   ry   r  rZ   _l1_l2	_lr_power)	rq   rr   l1l2lr_powerrs   r`   rt   rw   r?  rO   rP   ry     s   	
zFtrlOptimizer.__init__c                 C   r  r  )rW   r   rK  rY   r   _squared_acc_str_linear_acc_strr\  rO   rO   rP   r     r  z"FtrlOptimizer._create_accumulatorsc                 C   s   t |tjs
td| | j|d }| | j|d }t r@t	|d |||d | 
||d ||d| jd| jd| j d S |j| j|d |d ||| 
|d|d ||d| j| j| jd	d
d}|S )Nr  r   r   r  r  r  )r   r   ZSquaredAccumulatorZLinearAccumulatorr   )r   ZSquaredAccumOutZLinearAccumOut)r  r  r  TrP  )rW   r   rK  rY   r   r  r  rV   r)   r  r   r  r  r  r   rZ   )rq   r   r   Zsquared_accZ
linear_accZftrl_oprO   rO   rP   r     sD   z!FtrlOptimizer._append_optimize_op)r   r   rh  NNNN)
rk   r7  r8  r9  r  r  ry   r   r   rS  rO   rO   r?  rP   r=   u  s    ^r=   c                       sN   e Zd ZdZdZdZdZdZ						
					d fdd	Zdd Z	  Z
S )rC   a  
    LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer.

    LAMB Optimizer is designed to scale up the batch size of training without losing 
    accuracy, which supports adaptive element-wise updating and accurate layer-wise 
    correction. For more information, please refer to `Large Batch Optimization for 
    Deep Learning: Training BERT in 76 minutes <https://arxiv.org/abs/1904.00962>`_ .

    The updating of parameters follows:

    ..  math::

        m_t &= \\beta_1 m_{t - 1}+ (1 - \\beta_1)g_t 

        v_t &= \\beta_2 v_{t - 1}  + (1 - \\beta_2)g_t^2

        m_t &= \\frac{m_t}{\\beta_1^t}

        v_t &= \\frac{v_t}{\\beta_2^t}

        r_t &= \\frac{m_t}{\\sqrt{v_t}+\\epsilon}

        w_t &= w_{t-1} -\\eta_t \\frac{\\left \| w_{t-1}\\right \|}{\\left \| r_t + \\lambda w_{t-1}\\right \|} (r_t + \\lambda w_{t-1})


    where :math:`m` is the 1st moment, and :math:`v` the 2nd moment, :math:`\\eta` the 
    learning rate, :math:`\\lambda` the LAMB weight decay rate.

    Args:
        learning_rate (float|Variable, optional): the learning rate used to update parameters. \
            Can be a float value or a Variable with data type float32. Default 0.001.
        lamb_weight_decay (float, optional): The LAMB weight decay rate. Default 0.01.
        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
            Default 0.9.
        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
            Default 0.999.
        epsilon (float, optional): A small float value for numerical stability. Default 1e-6.
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies 
            ( :ref:`api_paddle_fluid_clip_ClipGradByGlobalNorm` , :ref:`api_paddle_fluid_clip_ClipGradByNorm` ,
            :ref:`api_paddle_fluid_clip_ClipGradByValue` ). If you want better convergence, it is recommended
            to use :ref:`api_paddle_fluid_clip_ClipGradByGlobalNorm` . Default None, meaning there is no gradient clipping.
        exclude_from_weight_decay_fn (function|None): Exclude a parameter from weight 
            decay when **exclude_from_weight_decay_fn(parameter)** returns true. 
            Default None.
        name(str|None): For detailed information, please refer to 
            :ref:`api_guide_Name` . Usually name is no need to set and None by default.

    Examples:
        .. code-block:: python
            
            import paddle.fluid as fluid 

            data = fluid.data(name='x', shape=[-1, 5], dtype='float32')
            hidden = fluid.layers.fc(input=data, size=10)
            cost = fluid.layers.mean(hidden)

            def exclude_fn(param):
                return param.name.endswith('.b_0')

            optimizer = fluid.optimizer.Lamb(learning_rate=0.002,
                                             exclude_from_weight_decay_fn=exclude_fn)
            optimizer.minimize(cost)
    r  r  r  r  r  {Gz?r  rf  r  Nc              
      sr   |d usJ |d usJ |d usJ |d usJ |d usJ t t| j||||||||
d d| _|| _|	| _d S )N)rr   rs   r`   rt   r  r  r  rw   lamb)r;  rC   ry   rZ   _weight_decay_exclude_from_weight_decay_fn)rq   rr   Zlamb_weight_decayr  r  r  rs   r`   rt   Zexclude_from_weight_decay_fnrw   r?  rO   rP   ry   p  s"   
zLambOptimizer.__init__c                 C   s6  t |tjsJ d|j_| | j|d }| | j|d }| | j|d }| | j	|d }| j
d ur?| 
|d r?d}n| j}| |}d }	t rrt|d |d ||||||	|d |||||	d| jd| jd| jd| d S |j| j|d |d |||||d	|d ||||d
| j| j| j|ddd}
|
S )NTr   r   r   r  r  r  weight_decayr  r  )r  r  r  r  rP  )rW   r   rK  r   Z	_use_lambr   r  r  r  r  r  r  r   rV   r)   r  r  r  r  r   rZ   )rq   r   r   r  r  r  r  r  r   rR  Zlamb_oprO   rO   rP   r     sl   





z!LambOptimizer._append_optimize_op)
r  r  r  rf  r  NNNNN)rk   r7  r8  r9  r  r  r  r  ry   r   rS  rO   rO   r?  rP   rC   "  s$    HrC   c                       sX   e Zd ZdZ				d fdd	Zdd Zdd	 Zd
d ZedddZ	dd Z
  ZS )r@   a  
	:api_attr: Static Graph

    The ModelAverage optimizer accumulates specific continuous historical parameters
    during training. The accumulated historical range can be controlled by the passed
    ``average_window_rate`` argument. The averaged ``Parameter`` are used in the prediction,
    which usually can improve the accuracy of the prediction.

    Accumulate the average of the ``Parameter`` in the sliding window, the result will be saved
    in a temporary variable, can be applied to the current model's ``Parameter`` by calling
    the ``apply()`` method, and the current model ``Parameter`` can be restored by calling
    the ``restore()`` method.

    The window size for calculating the average is determined by ``average_window_rate``,
    ``min_average_window``, ``max_average_window`` and the current ``Parameter`` update times (num_updates).

    When the cumulative times (num_accumulates) is greater than the specific window
    threshold (average_window), the accumulated ``Parameter`` temporary variable is set to 0.0.
    The following example will help to understand the role of these arguments:

    ::

        if num_accumulates >= min_average_window and num_accumulates >= min(max_average_window, num_updates * average_window_rate):
            num_accumulates = 0

    In the above conditional judgment statement, ``num_accumulates`` indicates the current
    accumulated number, which can be abstractly understood as the length of the cumulative window.
    The length of the window must be at least the length set by the ``min_average_window`` argument,
    and cannot exceed the length specified by the ``max_average_window`` argument or
    ``num_updates * average_window_rate``, where ``num_updates`` indicates the current ``Parameter``
    update times, ``average_window_rate`` is a coefficient that calculates the length of the window.

    Args:
        average_window_rate (float): The calculate ratio of the window length relative to ``Parameter`` update times.
        min_average_window (int, optional): the minimum size of average window length. The default value is 10000.
        max_average_window (int, optional): The maximum size of average window length. The default value is 10000.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        name (str, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.

    Examples:

      .. code-block:: python

        import paddle.fluid as fluid
        import numpy

        # First create the Executor.
        place = fluid.CPUPlace()  # fluid.CUDAPlace(0)
        exe = fluid.Executor(place)

        train_program = fluid.Program()
        startup_program = fluid.Program()
        with fluid.program_guard(train_program, startup_program):
            # build net
            data = fluid.data(name='X', shape=[None, 1], dtype='float32')
            hidden = fluid.layers.fc(input=data, size=10)
            loss = fluid.layers.mean(hidden)
            optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1)
            optimizer.minimize(loss)

            # build ModelAverage optimizer
            model_average = fluid.optimizer.ModelAverage(0.15,
                                                         min_average_window=10000,
                                                         max_average_window=12500)

            exe.run(startup_program)
            for i in range(12500):
                x = numpy.random.random(size=(10, 1)).astype('float32')
                outs = exe.run(program=train_program,
                               feed={'X': x},
                               fetch_list=[loss.name])

            # apply ModelAverage
            with model_average.apply(exe):
                x = numpy.random.random(size=(10, 1)).astype('float32')
                exe.run(program=train_program,
                        feed={'X': x},
                        fetch_list=[loss.name])
    '  Nc           
   
      s  t  rtdtt| jd||d || _|| _|| _g | _	t 
   D ]#}|jdkrJ|jjtd|jdg|jddd}| j	||f q'| j	D ];\}}|d u rWqN|jj||g! td	 | | W d    n1 suw   Y  W d    n1 sw   Y  qNt | _| j }t| jd
 | j	D ]}	| ||	 qW d    n1 sw   Y  t | _| j }t| jd
 | j	D ]}	| ||	 qW d    d S 1 sw   Y  d S )Nz'In dygraph, don't support ModelAverage.r   )r`   rw   Fr  r  Trw   rf   r   r   Zmove_averager  ) r   rV   r   r;  r@   ry   average_windowmin_average_windowmax_average_windowr  r	   r   r-  do_model_averager   r  r   r  r  rw   rf   r   r   r   r   _append_average_accumulate_opr   apply_programr   _add_average_apply_oprestore_program_add_average_restore_op)
rq   Zaverage_window_rater$  r%  r`   rw   rx   r  r   
param_gradr?  rO   rP   ry   4  sf   

 



"zModelAverage.__init__c                 C   s  | |d }| |d }| | d|}| | d|}| | d|}| | d|}| | d|}	| | d|}
tj||d	 tj||	gd
}tj|||gd
}tj|| jd krfdn| jd}tj|| jd krudn| jd}tj|||d d S )Nr   r   sum_1sum_2sum_3num_accumulatesold_num_accumulatesnum_updatesinputoutputr$  r   )r%  rf   )r%  r{  r}   )	_clone_variabler   r   assignr  rB  re   r   Z_elementwise_div)rq   r   r,  rx   r  r-  r.  r/  r0  r1  r2  r  r  rO   rO   rP   r)  b  s0   


z"ModelAverage._add_average_apply_opc                 C   s.   | |d }| |d }tj||d d S )Nr   r   r3  )r6  r   r7  )rq   r   r,  rx   r  rO   rO   rP   r+  y  s   z$ModelAverage._add_average_restore_opc              
   C   s   t d| _| d|}| d|}| d|}| jd|ddgd}| jd	|ddgd}| jd
|ddgd}| jjd|||||||d||||||d| j| j| jddd d S )NZaverage_accumulater-  r.  r/  r0  int64r   )rf   r   r1  r2  Zaverage_accumulates)rx   Zin_sum_1Zin_sum_2Zin_sum_3Zin_num_accumulatesZin_old_num_accumulatesZin_num_updates)Z	out_sum_1Z	out_sum_2Z	out_sum_3Zout_num_accumulatesZout_old_num_accumulatesZout_num_updates)r#  r$  r%  TrP  )r   rl   r   r   r#  r$  r%  )rq   rx   r-  r.  r/  r0  r1  r2  rO   rO   rP   r'  ~  sR   



z*ModelAverage._append_average_accumulate_opTc              	   c   @    | | j zdV  W |r| | dS dS |r| | w w )aE  
        Apply the average of the cumulative ``Parameter`` to the parameters of the current model.

        Args:
            executor(fluid.Executor): The current network executor.
            need_restore(bool): Restore flag variable, if set to True, the network will restore
                the parameters of the network to the default value, if set to False,
                it will not be restored. The default value is True.

        Examples:

          .. code-block:: python

            import paddle.fluid as fluid
            import numpy

            # First create the Executor.
            place = fluid.CPUPlace()  # fluid.CUDAPlace(0)
            exe = fluid.Executor(place)

            train_program = fluid.Program()
            startup_program = fluid.Program()
            with fluid.program_guard(train_program, startup_program):
                # build net
                data = fluid.data(name='X', shape=[None, 1], dtype='float32')
                hidden = fluid.layers.fc(input=data, size=10)
                loss = fluid.layers.mean(hidden)
                optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1)
                optimizer.minimize(loss)

                # build ModelAverage optimizer
                model_average = fluid.optimizer.ModelAverage(0.15,
                                                            min_average_window=10000,
                                                            max_average_window=12500)

                exe.run(startup_program)
                for i in range(12500):
                    x = numpy.random.random(size=(10, 1)).astype('float32')
                    outs = exe.run(program=train_program,
                                feed={'X': x},
                                fetch_list=[loss.name])

                # apply ModelAverage
                with model_average.apply(exe):
                    x = numpy.random.random(size=(10, 1)).astype('float32')
                    exe.run(program=train_program,
                            feed={'X': x},
                            fetch_list=[loss.name])
        Nrunr(  restorerq   executorZneed_restorerO   rO   rP   apply  s   3zModelAverage.applyc                 C      | | j dS )a  
        Restore ``Parameter`` values of current model.
        
        Args:
            executor(fluid.Executor): The current network executor.

        Examples:

          .. code-block:: python

            import paddle.fluid as fluid
            import numpy

            # First create the Executor.
            place = fluid.CPUPlace()  # fluid.CUDAPlace(0)
            exe = fluid.Executor(place)

            train_program = fluid.Program()
            startup_program = fluid.Program()
            with fluid.program_guard(train_program, startup_program):
                # build net
                data = fluid.data(name='X', shape=[None, 1], dtype='float32')
                hidden = fluid.layers.fc(input=data, size=10)
                loss = fluid.layers.mean(hidden)
                optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1)
                optimizer.minimize(loss)

                # build ModelAverage optimizer
                model_average = fluid.optimizer.ModelAverage(0.15,
                                                            min_average_window=10000,
                                                            max_average_window=12500)

                exe.run(startup_program)
                for i in range(12500):
                    x = numpy.random.random(size=(10, 1)).astype('float32')
                    outs = exe.run(program=train_program,
                                feed={'X': x},
                                fetch_list=[loss.name])

                # apply ModelAverage
                with model_average.apply(exe, False):
                    x = numpy.random.random(size=(10, 1)).astype('float32')
                    exe.run(program=train_program,
                            feed={'X': x},
                            fetch_list=[loss.name])

                # restore Parameters
                model_average.restore(exe)
        Nr;  r*  rq   r>  rO   rO   rP   r<    s   2zModelAverage.restore)r   r   NNT)rk   r7  r8  r9  ry   r)  r+  r'  r%   r?  r<  rS  rO   rO   r?  rP   r@     s    X.,9r@   c                   @   sP   e Zd ZdZdddZdd Zdd	 Zd
d Zdd Ze	dddZ
dd ZdS )rD   aL  
	:api_attr: Static Graph

    Compute the moving average of parameters with exponential decay.
    Given a parameter :math:`\\theta`, its exponential moving average (EMA)
    will be

    ..  math::

        \\text{EMA}_0 & = 0

	\\text{EMA}_t & = \\text{decay} * \\text{EMA}_{t-1} + (1 - \\text{decay}) * \\theta_t

    The average results calculated by **update()** method will be saved in 
    temporary variables which are created and maintained by the object, and can 
    be applied to parameters of current model by calling **apply()** method. And 
    the **restore()** method is used to restore the parameters.

    **Bias correction**. All EMAs are initialized to :math:`0` and hence they will be 
    zero biased, which can be corrected by divided by a factor 
    :math:`(1 - \\text{decay}^t)` , i.e., the actual EMAs applied to parameters 
    when calling **apply()** method would be 

    ..  math::
    
        \\widehat{\\text{EMA}}_t = \\frac{\\text{EMA}_t}{1 - \\text{decay}^t}

    **Decay rate scheduling**. A large decay rate very close to 1 would result 
    in that the averages move very slowly. And a better strategy is to set a 
    relative smaller decay rate in the very beginning. The argument **thres_steps**
    allows users to pass a Variable to schedule the decay rate, in this case, 
    the actual decay rate becomes
     
    ..  math::
    
        \\min(\\text{decay}, \\frac{1 + \\text{thres_steps}}{10 + \\text{thres_steps}})

    Usually **thres_steps** can be the global training steps.


    Args:
        decay (float, optional): The exponential decay rate, usually close to 1, such as 0.999, 0.9999, ... . Default 0.999.
        thres_steps (Variable|None, optional): If not `None`, schedule the decay rate. Default None.
        name (str|None, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default.


    Examples:

        .. code-block:: python

            import numpy
            import paddle
            import paddle.static as static
            from paddle.static import ExponentialMovingAverage

            paddle.enable_static()

            data = static.data(name='x', shape=[-1, 5], dtype='float32')
            hidden = static.nn.fc(x=data, size=10)
            cost = paddle.mean(hidden)

            test_program = static.default_main_program().clone(for_test=True)
            optimizer = paddle.optimizer.Adam(learning_rate=0.001)
            optimizer.minimize(cost)

            ema = ExponentialMovingAverage(0.999)
            ema.update()

            place = paddle.CPUPlace()
            exe = static.Executor(place)
            exe.run(static.default_startup_program())

            for pass_id in range(3):
                for batch_id in range(6):
                    data = numpy.random.random(size=(10, 5)).astype('float32')
                    exe.run(program=static.default_main_program(),
                    feed={'x': data}, 
                    fetch_list=[cost.name])

                # usage 1
                with ema.apply(exe):
                    data = numpy.random.random(size=(10, 5)).astype('float32')
                    exe.run(program=test_program,
                        feed={'x': data}, 
                        fetch_list=[hidden.name])

                # usage 2
                with ema.apply(exe, need_restore=False):
                    data = numpy.random.random(size=(10, 5)).astype('float32')
                    exe.run(program=test_program,
                        feed={'x': data}, 
                        fetch_list=[hidden.name])
                ema.restore(exe)

    rf  Nc                 C   s  t  rtd|| _|| _|d ur|nd| _|  | _d| _g | _	t
   D ]&}|jdkrO|jjtd| j|j dg|jddd}| j	||f q)i | _| j	D ]:\}}|jj||g% td	 | || j|j< W d    n1 s|w   Y  W d    n1 sw   Y  qVt | _| j }t| jd
 | |\}}| j	D ]r\}}||}||}|| j|j }	t j!||d t j"# F}
|
$|dk t j!||	d|  d W d    n1 sw   Y  |
%  t j!||	d W d    n	1 sw   Y  W d    n	1 sw   Y  qW d    n	1 s)w   Y  t | _&| j& }t| j&d
# | j	D ]\}}||}||}t j!||d qAW d    d S 1 sdw   Y  d S )Nz3In dygraph, don't support ExponentialMovingAverage. z@EMA_STEP_COUNTER@Fr  Zema_tmpTr!  moving_averager"  r3  r   r   )r5  r4  )'r   rV   r   r  _thres_stepsrU   _get_ema_decay
_decay_var_step_counter_name_params_tmpsr	   r   r-  r&  r   r  r   r   r  rw   rf   r   	_ema_varsr   r   r   _create_ema_varsr   r(  r   _get_decay_powr6  r   r7  control_flowSwitchcasedefaultr*  )rq   r  Zthres_stepsrw   rx   r  r   Z	decay_powr{   ZemaswitchrO   rO   rP   ry   z  s   

 







$z!ExponentialMovingAverage.__init__c              
   C   s6  t    tjjdg| jdddd}| jd ur| jd | jd  }tj J}|	|| jk  tj
|| W d    n1 sBw   Y  |  tj
tj| jgtjd| W d    n1 sew   Y  W d    n1 stw   Y  W d    |S W d    |S W d    |S 1 sw   Y  |S )	Nr   r   TZscheduled_ema_decay_rate)r   r   rf   r   rw   r   g      $@)rf   )r	   r   r   r"   r   r  rF  rN  rO  rP  r7  rQ  r   r   r   )rq   	decay_varZdecay_trR  rO   rO   rP   rG    s@   




z'ExponentialMovingAverage._get_ema_decayc                 C   sD   t j| jdgdddd}t |d}|| j}t ||}||fS )Nr   r   r8  Tr   r   )r   r   rI  rB  r6  rH  Zelementwise_pow)rq   r   r{   rS  Zdecay_pow_accrO   rO   rP   rM    s   z'ExponentialMovingAverage._get_decay_powc                 C   s.   t jt| j|j d |jd|jdd}|S )NZ_emar   Tr   )r   r   r   r   rU   rw   r   rf   )rq   rx   	param_emarO   rO   rP   rL    s   z)ExponentialMovingAverage._create_ema_varsc              
   C   s  t j| jd}g }| jD ]b\}}|jj||gM td9 | j|j	 }|j	d | jv r=| j|j	d  }|
||g n|| j |d| j   }t j||d W d   n1 sZw   Y  W d   n1 siw   Y  q|D ]\}}t  jdd|id	|i|j|jd
d qqdS )zm 
        Update Exponential Moving Average. Should only call this method in 
        train program.
        )r  rE  z.masterr   r3  NrB  r
  r   rC  r  )r   Zautoincreased_step_counterrI  rJ  r   r   r   r   rK  rw   r   rH  r7  r	   r   r   rf   )rq   r{   Zparam_master_emasrx   r  rT  Z
master_emaZema_trO   rO   rP   r.    sB   
 
zExponentialMovingAverage.updateTc              	   c   r9  )a  
        Apply moving average to parameters for evaluation.
        
        Args:
            executor (Executor): The Executor to execute applying.
            need_restore (bool, optional): Whether to restore parameters after 
                applying. Default True.
        Nr:  r=  rO   rO   rP   r?    s   
zExponentialMovingAverage.applyc                 C   r@  )zwRestore parameters.
        
        Args:
            executor (Executor): The Executor to execute restoring.
        NrA  rB  rO   rO   rP   r<    s   z ExponentialMovingAverage.restore)rf  NNrC  )rk   r7  r8  r9  ry   rG  rM  rL  r.  r%   r?  r<  rO   rO   rO   rP   rD     s    
`3
rD   c                   @   sx  e Zd ZdZdZddZdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd[d#d$Zd%d& Zd'd( Zd)d* Zd+d, Zd-d. Zd/d0 Zd1d2 Zd3d4 Zd5d6 Zd7d8 Z	9	"	"d\d:d;Zd<d= Z	"d[d>d?Z d@dA Z!dBdC Z"dDdE Z#dFdG Z$dHdI Z%dJdK Z&dLdM Z'dNdO Z(dPdQ Z)dRdS Z*dTdU Z+dVdW Z,	"	"	"d]dXdYZ-d"S )^rE   a	  
	:api_attr: Static Graph

    Pipeline Optimizer: Make a program to run as pipeline, that is splitting a
    program into multiple sections (sub-programs) and each section run on a
    device to enable the training of large scale models and the use of
    heterogeneous devices. Meanwhile, all sections run in the stype of pipeline.

    Args:
        optimizer (Optimizer): The optimizer to use, such as SGD.
        num_microbatches (int): Number of microbatches. [Optional. Default:1].
        start_cpu_core_id (int): The first cpu core id to use. [Optional. Default:0].
    
    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            import paddle.fluid.layers as layers

            with fluid.device_guard("gpu:0"):
                x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
                y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
                data_loader = fluid.io.DataLoader.from_generator(
                    feed_list=[x, y],
                    capacity=64,
                    use_double_buffer=True,
                    iterable=False)

                emb_x = layers.embedding(input=x, param_attr=fluid.ParamAttr(name="embx"), size=[10,2], is_sparse=False)
                emb_y = layers.embedding(input=y, param_attr=fluid.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False)

            with fluid.device_guard("gpu:1"):
                concat = layers.concat([emb_x, emb_y], axis=1)
                fc = layers.fc(input=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False)
                loss = layers.reduce_mean(fc)
            optimizer = fluid.optimizer.SGD(learning_rate=0.5)
            optimizer = fluid.optimizer.PipelineOptimizer(optimizer)
            optimizer.minimize(loss)

            def train_reader():
                for _ in range(4):
                    x = np.random.random(size=[1]).astype('int64')
                    y = np.random.random(size=[1]).astype('int64')
                    yield x, y
            data_loader.set_sample_generator(train_reader, batch_size=1)

            place = fluid.CUDAPlace(0)
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())
            batch_size = 1
            data_loader.start()
            exe.train_from_dataset(
                    fluid.default_main_program())
            data_loader.reset()
    r   r   c                 C   s  d| _ t rd| _ nt rd| _ t rtdttj	jtj
jjjjf}t||s5td|t||| _| j| _t| jdrM| jj| _t| jdsB|dksUJ d|| _|d	ks`J d
|| _d | _tj}|j| _| | _| | _ |! | _"d | _#g | _$t% | _&d | _'d | _(d S )Nr   npugpuz,In dygraph, don't support PipelineOptimizer.zdThe 'optimizer' parameter for PipelineOptimizer must be an instance of {}, but the given type is {}.	inner_optr   z*num_microbatches must be a positive value.r   z1start_cpu_core_id must be a non-negative integer.))_devicer!   is_compiled_with_npuri  r   rV   r   rH   paddler   ZfluidcontribZmixed_precision	decoratorZOptimizerWithMixedPrecisionrW   r  r   rZ   
_optimizer_origin_optimizerr  rW  _num_microbatches_start_cpu_core_idZ_place_listr   r  _op_roler  _op_role_keyr  _op_role_var_keyr   _op_device_keyro   _pipeline_pairrN   _pp_ring_mapoutput_var_to_opinput_var_to_op)rq   r   num_microbatchesstart_cpu_core_idZvalid_optimizersr  rO   rO   rP   ry   H  sR   








zPipelineOptimizer.__init__c           	      C   sL  |j | }|j d }||}d}|jdkrIt|d }|j|dgdd}|j|d | dd|id	|id
|j	d|j	| j
| jjid |d7 }|j|d | |jdkrWdndd|jdkr`|n|id	|jdkrj|n|id| j| j
| jjddid |d7 }|jdkr|j|d | dd|id	|id
|j	d|j	| j
| jjid |d7 }|S )zj
        Insert allreduce op to sync global information for global
        gradient clip and amp.
        r   
reduce_anyZ_cast_int32r   r|   rw   r   rf   rB  r
  r   rD  rE  r  Zc_allreduce_maxZc_allreduce_sumring_iduse_calc_streamT)r   descoutput_arg_namesr   rZ   r   r   r  
_insert_oprf   rb  ra  Optimizeglobal_ring_id)	rq   op_idxr   r   out_nameZout_varoffsetZtemp_var_nameZtemp_varrO   rO   rP   _insert_allreduce_ops  sZ   



	




	z&PipelineOptimizer._insert_allreduce_opc                 C   s  t  }d}d}|j }||| k rbd}|j| }g }	|jdkr)| |r)d}n|jdkrN| |rN|jdD ]}
||
rE|	|
 q9|j	d|	 n|jdkru|jdD ]}
||
re|	|
 qY|j	d|	 |j
d|	 n]|jd	kr|jdD ]}
||
r|	|
 q|j	d|	 |j
d|	 t|	dkr|| |d
8 }qn&|jdkr| |r|jdD ]}
||
r|	|
 q|j	d|	 d}|j |j  }|D ]a}||v sd|v rq|| |t|rq|t|}|jtjjjkr|j|tjjj|jd}n&t|tr3|j|j|j|j|j|j|j |j!|j"|j#|j$d
}n|%|d}| &|| q|d
7 }| j'sK|sLq| (|d
 |}||7 }||7 }||| k s|)  d S )Nr   Frk  Tconcatr
  update_loss_scalingr   check_finite_and_unscaler   r  Z_blocking_queue)rw   rZ   r   )
rw   r   rf   rZ   r	  r   r   r   r\   
error_clip)*r   ro  op_sizer   rZ   _is_optimize_opr4  Z_find_var_recursiver   Z	set_inputZ
set_outputr   
_remove_op_is_gradient_clip_opr   rp  addr   Z_var_recursiver!   r   r   ZREADERr  r   rW   r   Zcreate_parameterrw   r   rf   r	  r   r   r   r\   r{  r6  _clone_var_attruse_shardingrw  _sync_with_cpp)rq   r   Z	ori_blockZused_var_setZadded_op_numrt  r|  Zshould_insertr   Z
reserved_x
input_namevarsr   Z
source_varZdest_varZinserted_opsrO   rO   rP   _create_vars  s   













KzPipelineOptimizer._create_varsc                 C   s@   | j |jv sJ t|| j }|t| jj@ o|t| jj@ S rM   )rb  r  r   r   ra  r  Loss)rq   r   op_rolerO   rO   rP   _is_loss_grad_op  s
   z"PipelineOptimizer._is_loss_grad_opc                 C   s(   | j |jv ot|| j t| jjkS rM   )rb  r  r   r   ra  Forwardrq   r   rO   rO   rP   _is_forward_op  s
   

z PipelineOptimizer._is_forward_opc                 C   (   | j |jv ot|| j t| jj@ S rM   )rb  r  r   r   ra  r  r  rO   rO   rP   _is_backward_op     z!PipelineOptimizer._is_backward_opc                 C   s,   | j |jv sJ t|| j t| jjkS rM   )rb  r  r   r   ra  r  r  rO   rO   rP   _is_loss_op  s   zPipelineOptimizer._is_loss_opc                 C   r  rM   )rb  r  r   r   ra  rr  r  rO   rO   rP   r}  
  r  z!PipelineOptimizer._is_optimize_opc                 C   s   d|j v od|j v od|j v S )Nr   r   r   )input_namesr  rO   rO   rP   _is_update_op  s   zPipelineOptimizer._is_update_opc                 C   s   t t}|d}|jD ]J}|| j}|| j dkr<|D ]}|| }|j}| j	 }	|	
| |	| jd qq|| }|j}| j	 }	|	
| |	| jd qg }
|D ]}|| }|  |
| q[|
S )a  
        Split a program into sections according to devices that ops run on.
        The op whose op_device attr is "gpu:all" is copied to all sections.

        Args:
            main_program (Program): the main program
            devices: all used devices
        r   :allrD  )r   r   r   r   r   rd  rX  ro  r   r   	copy_fromr  r  r   )rq   r  ZdevicesZdevice_program_mapr   r   r   r   op_descap_opprogram_listr   rO   rO   rP   _split_program  s.   




z PipelineOptimizer._split_programc                 C   s8   d|v sd|v sJ d|d| d }| j| }|S )a  
        For adam optimizer, it will add accumulators and initialize them
        with fill_constant, and force the op device to cpu. Hence, we should
        get the real op_device attribute of the fill_constant as the device
        where the corresponding parameters on.
        r  r  zPFor accumulators for Adam, the name must contain beta1_pow_acc or beta2_pow_acc.r   Z_beta)indexro   )rq   r   r   r   rO   rO   rP   "_get_op_device_for_startup_program8  s   
z4PipelineOptimizer._get_op_device_for_startup_programc                 C   s   |  }t }|jD ]J}|| j}|dkr)|jdksJ d|jd }| |}|r5t|	dd }nd }|r>||kr>q
|j
}	|  j
 }
|
|	 |
| jd q
|  | |  | |S )Nr   r   zcFor ops in startup program with the op_device attribute of cpu, they must be of type fill_constant.r   :r   rD  )r   r   r   r   rd  rZ   rp  r  r   splitro  r   r  r  r  r  )rq   r  Z	device_idr   new_startup_programr   r   
output_varZdevice_indexr  r  rO   rO   rP   _split_startup_programF  s*   



z(PipelineOptimizer._split_startup_programc                 C   sj   d|v r
| dd}d|v r| dd}| j| }|dkrdS d}t|D ]\}}||kr2|} |S q%|S )zM
        Find the post op that has variable named var_name as input.
        z
.cast_fp32rD  
.cast_fp16N)replacerh  reversed)rq   r  r   Zpost_ops	result_oppost_opZpost_idxrO   rO   rP   _find_post_op`  s   
zPipelineOptimizer._find_post_opc                 C   sB   | j | }|dkrdS d}t|D ]\}}||k r|} |S q|S )ze
        Find the previous op of op with index that outputs
        variable named var_name.
        N)rg  r  )rq   r  r   Zprev_opsr  prev_opZprev_idxrO   rO   rP   _find_prev_ops  s   
zPipelineOptimizer._find_prev_opc                 C   s   | || ||| d S rM   )_rename_inputZ_rename_output)rq   r   Zold_namenew_namerO   rO   rP   _rename_arg  s   zPipelineOptimizer._rename_argNc              
   C   sH   |j ||j|du r|jn||j|j|j|j|j d}| 	|| |S )z
        Create a new var for block, which has the same type,
        shape and dtype as ref_var, then rename it with the
        name `name`.
        N)rw   r   rf   rZ   r	  r   is_dataneed_check_feed)
r  r   rf   rZ   r	  r   r  ro  r  r  )rq   r   Zref_varrw   rf   Znew_varrO   rO   rP   _create_var  s   	zPipelineOptimizer._create_varc                 C   s"   |j |_ t|dr|j|_d S d S )Nis_distributed)r   r  r  )rq   destsrcrO   rO   rP   r    s   
z!PipelineOptimizer._clone_var_attrc                 C   s&   | t }|dkr|d| S |S )zD
        Strip the grad suffix from the given variable name
        rI   N)findr!   grad_var_suffix)rq   rw   posrO   rO   rP   _strip_grad_suffix  s   z$PipelineOptimizer._strip_grad_suffixc                 C   s   |t   S )z?
        Append grad suffix to the given variable name
        )r!   r  r   rO   rO   rP   _append_grad_suffix  s   z%PipelineOptimizer._append_grad_suffixc                 C   sL   | | jr|| jnd}|r$|dd dks$|dd dks$J d|S )z6
        Get the op_device attribute of a op.
        Nr      rV  rU  zDNow, only gpu and npu devices are supported in pipeline parallemism.)has_attrrd  r   )rq   r   r   rO   rO   rP   _get_op_device_attr  s   
(z%PipelineOptimizer._get_op_device_attrc                 C   sh  t | jj}|| j|kr|| j| j d dS |jdkrp| 	|rp|j
 D ]
}d|v s4J dq*t|j
 dks@J |j
 d }| ||}|ds[J d|j||| j}|sgJ d	|| j| dS |jd
ksz|jdkr| 	|r| ||j
dd }	|| j|	| j dS |jdkr| |st|jdkrt|jdksJ |jd }
|jd }d|v r| ||}|| j|| j dS | ||j
dd }	|| j|	| j dS | |rPd}|j||  | jr|j||  | js(|d7 }|j||  | jr|j||  | jr|j||  | j}|s:J dt|D ]}|j||  | j| q>dS | |r|jd
kr|d}t|dksjJ | |d }| j| }|| j| dS | |s| |r| j|jv sJ d|| j}t|dksJ d|d }| j| }|jdks|jdks|jdks|jdks|jdkr| j d}|| j| dS |jdks|jdkr|| j| j d || j| jj |jd }||}d|_dS g d}|j|v sJ d||j| |s'J || j| j d dS )a  
        Add op_device attrribute for ops that have not that attribute set.
        We use "gpu:all" to represent the op should be put on all
        sub-programs, such as lr-related ops. Note that: "gpu:all"
        is only used by pipeline as an indicator.
        r  r  z@RENAME@z3The op must be sum used to accumulate renamed vars.r   r   	op_devicez#{} has no op_device attr for var {}z$The post op must have op_device set.rB  r  r
  memcpy@Fetchz1Please put you program within device_guard scope.r   zEgradient_clip and regularization ops must have op_role_var attribute.r&   zHop_role_var for gradient_clip regularization ops must have two elements.sqrtr   Zelementwise_maxZelementwise_divZalloc_float_statusZclear_float_statusT)ry  rk  rx  r  rz  r  zIFor other ops without op_device set, they must be one of {}, but it is {}N) r   ra  LRSchedr   rb  r  rd  rX  rZ   r  ro  r   r   rp  r  r  r   r  r4  r}  r  r   ranger5  r  ro   r  _is_regularization_oprc  r  r   r   )rq   r   r   r   Zlrsched_rolerw   ru  r  r   r  r  Zoutput_namerv  i	grad_namer   op_role_varZfloat_status_nameZfloat_status_varZother_known_opsrO   rO   rP   _add_op_device_attr_for_op  s   












z,PipelineOptimizer._add_op_device_attr_for_opc                 C   sl   t t|jD ],\}}|jdks|jdks|jdkr&|| j| j d q| |r,q| ||| qdS )zf
        Add op_device attrribute for ops in block that have 
        not that attribute set.
        create_py_readerreadZcreate_double_buffer_readerr  N)		enumeraterS   r   rZ   r  rd  rX  r  r  )rq   r   r   r   rO   rO   rP   _add_op_device_attr  s   
z%PipelineOptimizer._add_op_device_attrc                 C   sl  g }t | jjt | jjt | jjt | jjt | jjt | jjt | jjB g}|jD ]}||j	sH|j	dkrD|
| jt | jjksHJ d|| jsXJ d|j	| j|
| j}t ||v snJ d||j	||| js~J d|j	| j|
| j}|sJ d|j	|| j dkrq+|dd }|d	ks|d
ksJ d||vr|| q+|S )z
        Check whether ops in a block have both the op_device and the 
        op_role attributes set.
        Then, return all devices in order.
        Zconditional_blockz`Now, the only supported op without kernel is conditional_block, and its op role must be LRSched.zop ({}) has no {} attribute.z&op_role {} for op {} must be one of {}z/op_device attribute for op {} has not been set.r  r  r   rV  rU  zDNow only gpu and npu devices are supported for pipeline parallelism.)r   ra  r  r  r  r  rr  r   _has_kernelrZ   r   rb  r  r   rd  rX  r  r   )rq   r   device_listZvalid_op_role_valuer   r  r   Zdev_typerO   rO   rP   _check_validation.  sh   







z#PipelineOptimizer._check_validationc                    s  t  d}ttjD ]\r} nqd|dttjD ]\j}|j dkr9q&jD ]			}|j
rGq<d}	}|du r^	jvrYq<j	 }|sj|rh|jnd}|du sv|j dkrwq<||kr|q<	vrg 	< ||f	 v rq<|dd d fdd  	f
dd	t|dd
 t|dd
  q<q&  dS )zp
        Insert a pair of send and recv ops for every two
        consecutive ops on different devices.
        Nr   )r  first_optimize_indexr  r  c                    sr     } }|s|sJ d |r&|| k s$J d||  d S |r5|| ks7J d||  d S d S )Nzfsend/recv in pipeline should only be inserted in forward or backward,please check the op_role of op={}zIn forward, send/recv can only be passed forward, but now prev_stage={} great than cur_stage={}, please check op_device of op={}zIn backward, send/recv can only be passed backward, but now prev_stage={} less than cur_stage={}, please check op_device of op={})r  r  r   )cur_idprev_idZ
is_forwardZis_backward)r   rq   rO   rP   _check_stage  s,   




zKPipelineOptimizer._insert_sendrecv_ops_for_boundaries.<locals>._check_stagec                    sV  t |  }t | }||f	 v rd S | | dkr5| d | | | d  	 ||f d S | | dk rT| d | | | d  	 ||f d S t| | dks^J 	 ||f j}j	 }|| f}|d |  }|jvrj| jj|< j} jd7  _nj| }j	dkrj
d  dd|ij|j|dd	d
dd|id d  d7  < t|j}	|	d dk rԈjn|	d |	d< j
d  dd|gid|	d|jj|j|dd	d
dd|id d  d7  < d S j	dkrt|j}	|	d dk rjn|	d |	d< t|	}
jdko1|
j dk}d|jv rt|jdd dd }|}j
d  dd|gid|gid|	d|jj|j|dd	id d  d7  < d S  | | j
d  dd|gid|gij|j|id d  d7  < |jdd }|}t|trd	nd}j
d  |r|rdndd|ij|j|ddd|d
ddjdjid d  d7  < d }t|tjjkrd }jj}n}jj}j
|d  dd|gid|gij|j|d|id}t|tjjkr4|dd  d  d7  < j
d  |rA|rCdnd!d|gid|	d|jj|j|dd	d
dd|djdji	d d  d7  < |r|sj
d  d"d|gid|gij|j|dd	ddd#jd$jid d  d7  < d S d S d S td% j	)&Nr   rI   r  zF-then-Br  send_v2r
  rn  Tpeerrm  r  rZ   r   r   r   recv_v2r   	out_shaperf   r  rZ   r   r   1F1BZsubprogr7  r  rZ   r   r   r   Zc_sync_calc_stream@FZpartial_sendnumidr  c_sync_comm_streampipeline_flagrD  partial_recvpartial_allgatherr  ZrankzCNow only 'F-then-B' and '1F1B' are supported.The given value is {}.)!r   r   r}  r   rb  r  re  rm  rf  schedule_mode_insert_op_without_syncrd  rS   r   micro_batch_sizerf   r   r  	mp_degreerw   r  r   rW   r   mp_rankr   ra  r  rr  r  r  r  r   )r  r  Zcur_devZprev_devr  r   pairpair_keyrm  Z	var_shapeZnumelZuse_mpZorigin_nameZassociate_varZprefix_nameZ
prefix_varZis_paraminsert_indexZnew_op_roleZsync_comm_op
r  _insert_send_recvr   Zdevice_typeZextra_index_infor  Zinput_var_to_devicer   rq   r   rO   rP   r    s  



















zPPipelineOptimizer._insert_sendrecv_ops_for_boundaries.<locals>._insert_send_recvr   )rN   r  rS   r   r}  r   rd  rX  r   r   r  r  ro   r  r   r  )rq   r   r  Z
cur_devicer   Zprev_devicer  rO   r  rP   #_insert_sendrecv_ops_for_boundariesa  sZ   





 B  sz5PipelineOptimizer._insert_sendrecv_ops_for_boundariesc                 C   s   | j dkrdS tttt|jD ]2\}}| |rD|jdks(J d|j|	ds/J t
|d}|| j  }|d|  dS qdS )zJ
        Scale the loss corresponding to number of micro-batches.
        r   Nr   z8loss_grad_op must be fill_constant op, but this op is {}r   )r_  r  tupler  rS   r   r  rZ   r   r  rX   r   r  )rq   r   r  r   Z
loss_scalerO   rO   rP   _insert_loss_scalek  s   

z$PipelineOptimizer._insert_loss_scalec           
      C   s   t |jD ]=\}}| |sq|j}|j}|| }|jdks#|jdkr$q|D ]}t |vr/q&|t }|d }	| 	|||	 q&qd S )NrB  r  @MERGED)
r  r   r}  r   rp  rZ   r!   r  stripr  )
rq   r   r  r   r  output_namesZin_out_namesrw   r   Znew_grad_namerO   rO   rP   _rename_gradient_var_name{  s   z+PipelineOptimizer._rename_gradient_var_nameFc                 C   s  |r|j nd}|r|jr| |||j|}|S g }d}|rdnd}	|r&tjnd}
tttt	|j
D ]4\}}| |rd|jdkrd|jd }|jd }|d| jv rd|dd	|ks^J || q3| |rq|du rq|d
 }| |rh| j|jv rh|| j}t|dkrq3t|d dksJ tdt|dD ]}d}|| }||sqd|v rq|t  }||	 }||s| ||j| ||
 ||sJ ||}||}d|_|j || di d|gid|j!d|j"dt#d| j$| j%j&j'id |d
7 }||d
  }|j| }d|v }||u}|rG|d }| ||||
}d|_|j || dd|id|id|j"d|j"| j$| j%j(id |d
7 }|}|j || dd||gid|i| j$| j%j(id |d
7 }|)| qq3|sn|S d}tttt	|j
D ]\}}| |r|du r|d
 } nq{|dusJ |D ]M}|dd	}|dd	}||s| ||j| | ||sJ ||}||}d|_|j |dd|id|id|j"d|j"| j$| j%j&id q|S )zz
        Create a new merged gradient for each parameter and accumulate the
        corresponding gradient to it.
        FN@MERGED@FP16r  rB  r   @GRADr  rD  r   r&   
@BroadCastTr   r   r   rf   r   r  	cast_fp16@TMPr
  rD  rE  r  z@FP16z@GRAD@MERGED@FP16)*fp16_allreduceZfuse_grad_merge_accumulate_gradients_with_fuseZfuse_grad_size_in_MBrZ  float16r  r  r  rS   r   r}  rZ   r   rp  r  ro   r  r~  r  rc  r  r   r   r  has_varr!   r  r  r  r   r   rq  r   rf   rX   rb  ra  rr  r  r  r   )rq   r   Zpp_allreduce_in_optimizeZstrategyshardr  Zfused_gradient_namesZmerged_gradient_namesfirst_opt_op_idxmerged_suffixrf   r  r   in_nameru  r  r  rv  r   Zparam_grad_nameZmerged_param_grad_nameZparam_grad_varZmerged_param_grad_varr  r  is_fp16_grad	need_castcast_grad_var_namecast_grad_varfp16_grad_nameZfp16_grad_varrO   rO   rP   _accumulate_gradients  s   	

 
















z'PipelineOptimizer._accumulate_gradientsc           *      C   s  |  ||}g }|rdnd}|rtjntj}d}	d }
|D ]n\}}||}|j|t  | ||jddd}||}t	|drD|j
|_
| |}t|dksZ|	| |ksZ|j|
krk||g|g|gf |j}
d}	q|d	 d | |d	 d
 | |d	 d | |	|7 }	qg }g }|D ]H}|d }|d }|jd|d j|d jddd}|d jtjkrdnd}|d|d j }|j||d jddd}|| || qt|t|ksJ t|t|ksJ d }t|jD ]\}}| |r|d u r|} nq|d usJ d}tt|D ]v}|| }|| }|| d }|| d
 } || d }!|j|| dd| i||dddddddd|d j| j| jjdt dtdid |d
7 }|j|| dd| i|!|ddddddtdddddd|!d j| j| jjjid |d
7 }q||7 }d}tt|D ]a}|| }|| }d|jv }"|"|u}#|#r|jd }$|j|$|ddd}%|j|| dd|id|%id |jd!|%j| j| jjid" |d
7 }|%}|j|| d#d||gid|i| j| jjid" |d
7 }q|rU|D ]R\}}||}|t  d }&||&sJ ||&}'|t  d }(|j|(tj|jddd})|j|| dd|'id|)id tjd!tj| j| jjid" |d
7 }qtt|D ]
}|| j||< q[||fS )$Nr  r  r   TF)rw   rf   r   r   r   r  r   rI   r   r&   zFusedGrad_{}r!  zFusedMergedGrad.cast_fp16.ZFusedMergedGradz_{}r  r  r  Zuser_defined_size_of_dtyper  r  rf   Zset_constantZconstantr  r  r  rB  r
  r   rD  rE  r  r  )_sort_grad_param_by_dtyperZ  r  r   r   r  r!   r  r   r  r  _get_var_sizer   rf   r   r   rw   r  r   r  r  r  rb  ra  r  rY  rX   rr  r  rq  r  )*rq   
main_blockfp16
fused_sizegrad_param_pairsr  Zgrad_param_segmentsr  rf   Zcur_sizeZ
last_dtyper  rx   Z	real_gradZmerged_grad_varZ
real_paramZtmp_sizeZfused_gradientsfused_merged_gradientsZgrad_param_segmentZgrad_segmentZmerged_grad_segmentZ
fused_gradZfused_merged_grad_name_prefixZfused_merged_grad_nameZfused_merged_gradZfirst_back_op_idxr  r   rv  r  ZgradsparamsZmerged_gradsr  r  r  r  r   Z	fp16_gradZfp32_grad_nameZ	fp32_gradrO   rO   rP   &_insert_accumulate_gradients_with_fuse!  sV  








 









z8PipelineOptimizer._insert_accumulate_gradients_with_fusec                 C   s  d }g }t ttt|jD ]\}}| |r?|jdkr?|jd }	|jd }
|
	d| j
v r?|	dd|
ks9J || q| |rV|d u rV|d }|t|jkrV d S | |r| j|jv r|| j}t|dkrnqt|d dksxJ tdt|dD ]}|| }||sqd|v rq|||d  || f qqt|dkrd S |r|jnd}d	d
 t|D }|D ]"}|r||d nd}d|  kr|k sJ  J || | qg }|D ]}| |||||\}}||7 }q|  |S )NrB  r   r  r  rD  r   r&   r  c                 S   s   g | ]}g qS rO   rO   )r   r   rO   rO   rP   r     s    zEPipelineOptimizer._accumulate_gradients_with_fuse.<locals>.<listcomp>)r  r  r  rS   r   r}  rZ   r   rp  r  ro   r  r~  r  r   rc  r  r   r  r  r   Z
worker_numr   r
  r  )rq   r  r  r  r  r  r  r  r   r  ru  r  r  r   r  Zdevice_to_pairsr  Zroot_idZall_fused_merged_gradientspairsr  rO   rO   rP   r    s`   





z1PipelineOptimizer._accumulate_gradients_with_fusec           	      C   sx   g }g }g }|D ]%}| |d j}|tjkr|| q|tjkr(|| q|| q|}|| || |S r"  )r   rf   rZ  r   r   r  extend)	rq   r  r  Z
fp16_pairsZ
fp32_pairsZother_pairsr  rf   Zsorted_pairsrO   rO   rP   r  )  s   



z+PipelineOptimizer._sort_grad_param_by_dtypec                 C   s   t jjjdt jjjdt jjjdt jjjdt jjjdt jjjdt jjj	dt jjj
di}d|jvs1J tdd |j||j  d d S )	Nr&         r   rI   c                 S   ry  rM   rO   rz  rO   rO   rP   rQ   G  r|  z1PipelineOptimizer._get_var_size.<locals>.<lambda>g      @)r!   r   r   rL  rG  ZFP64ZINT16ZINT32ZINT64BOOLZUINT8r   r#   rf   )rq   r   Zdtype_to_sizerO   rO   rP   r  ;  s&   








zPipelineOptimizer._get_var_sizec                 C   s   |j }|D ]E}|djD ]<}|dsq|dj}||}|jdd}|jD ]}	|	j}
|j }|	|
 q)|
  | || |d| qqd S )Nr   Z	sub_block)Z
parent_idx)r   r   r   r  r   r  Z_create_blockro  r   r  r  r  r  )rq   r  r  r  progr   Zorigin_sub_block_idZorigin_sub_blockZnew_sub_blockZsub_opr  r  rO   rO   rP   _add_sub_blocksJ  s"   



z!PipelineOptimizer._add_sub_blocksc                 C   s0   |j D ]}||jsq|| j}|  S d S rM   )r   r  rZ   r   rd  )rq   r   r   r  rO   rO   rP   _get_device_info[  s
   
z"PipelineOptimizer._get_device_infoc                 C   s  t  }|D ]0}|d}|jD ]%}|dkrq||}|jsq||vr'g ||< ||| vr4|| | qqt| D ]}t|| dkrK|	| q<t  }	| D ]P}|| D ]I}|d}|j
D ]>}
|
jdksy|
jdksy|
jdksy|
jdkrzqc|
| jt| jjjkrqc||
j v r||	vsJ d||
||	|<  nqcqYqS| D ]}||	vrq|	| }|d}| |}t|d	d }|| }|D ]}||krq|d}| |}t|d	d }||f}|d
 | }|| jvr| j| | j| j|< | j}|  jd7  _n| j| }|jddd||i| j|dd| j| jjd|d|id |jddd||gid||jd||j| j|dd| j| jjd|d|id |jddd||gid||gi| j|| j| jjd|id qqdS )zu
        Special Case: process persistable vars that exist in
        multiple sections, e.g., shared weight
        r   Zdouble_buffer_0r   r  r  r  ry  z2two sections write the same var({}): second op {}.r  r  r  r
  rn  Fr  rm  r  r   r  rf   r  r  r  N)rN   r   r  r   r   r   rS   keysr   popr   rZ   r   rb  r   ra  rr  r  ro  rp  r   r  r  re  rm  rf  rq  rd  r   rf   )rq   r  Zstartup_progr  Zvar_infor  r   r   r   Z
write_infor   Z
write_progZwrite_blockZwrite_deviceZwrite_dev_indexZ	all_progsZ
read_blockZread_deviceZread_dev_indexr  r  rm  rO   rO   rP   +_process_persistable_vars_in_multi_sectionsa  s   

















z=PipelineOptimizer._process_persistable_vars_in_multi_sectionsc                 C      |j do|j ddS )Nop_namescopez/gradient_clipro  r  r   
startswithr  rO   rO   rP   r       z&PipelineOptimizer._is_gradient_clip_opc                 C   r  )Nr  z/regularizationr  r  rO   rO   rP   r    r  z'PipelineOptimizer._is_regularization_opc                 C   s   |j dod|j dv S )Nr  zweight decay)ro  r  r   r  rO   rO   rP   _is_weight_decay_op  s   z%PipelineOptimizer._is_weight_decay_opc                 C   sh   t t}t t}t|jD ]"\}}|jD ]}|| ||g q|jD ]}|| ||g q#q||fS )z2
        Get info of op input and output.
        )r   rS   r  r   r   r   rp  )rq   r   rg  rh  r  r   r   rO   rO   rP   _get_input_output_info  s   

z(PipelineOptimizer._get_input_output_infoc           
      C   s
  | j dkrdS |d}| jdkrdnd}d}t|jD ]\}}|j|kr.| |r.|} nq|du r5dS d}tt|jD ]@\}}||krH n7|jdkr~|dr~|j	d }|
|}	|j|| d	d
 |d8 }|j|dd|	gid|	gi| j| jjid q>|  dS )zC
        optimize forward send's sync_comm_stream schedule
        r  Nr   r   r  r  r  r  F)syncnopr
  r   r  )r  r   r  r  r   rZ   r  rS   r  r   r   r~  r  rb  ra  r  r  )
rq   r   r   Z	recv_typeZbackward_recv_indexr  r   rv  r   r   rO   rO   rP   _optimize_forward_send_sync  s6   


z-PipelineOptimizer._optimize_forward_send_syncc                 C   s  d}d}|  }t|  j}t|D ]}d}|  j| }t|| j}	|	t| jjkr4|du r4|}|j	dkrI|j	dkrI|j	dkrI|j	dkrIq|	t| jj
kr]||krZ|d7 }q|}n|	t| jjkrq||krn|d7 }q|}ntd|	t }
|jD ]	}|||
|< q~t }|jD ]	}||||< q|j||j	|
|| d	 ||d  |	t| jj
kr|d7 }q|	t| jjkr|d7 }q|  dS )
zc
        A pass to move the recv op to the beginning of
        the forward/backward phase
        r   Nr  r  r  r  r   zUnknown op_role: {}r  )r   r   r   r  r   r   rb  ra  r  rZ   r  r  r   rN   r  r4  r  r5  r  r  r~  r  )rq   r   Zforward_insert_indexZbackward_insert_indexr   Znum_opsr  r  r   r  Z	op_inputsrw   Z
op_outputsrO   rO   rP   _mv_head_recv&  sZ   (


zPipelineOptimizer._mv_head_recvc                 C   s   |  }t }t }|jD ]/}| |r(|jD ]}|j| }|jr&|| qq| |r<|j	D ]}||v r;|| q0qt
|dkrEdS td| dS )z;
        Pipeline may need multiple forward before
        r   Na  The pipeline requires multiple forward calculations before backward, so when the persistable var is changed in the forward, it may cause errors in the backward calculation who using this persistable var. However, some backward op don't need this var(NoNeedBufferVars), there will be no error at this time.
So please check these persistable vars which changed in forward and used in backward:
{})r   r   r   r  rp  r  r   r  r  r   r   r  r   r   )rq   r   r   Zpersist_outputZused_in_backwardr   r   r   rO   rO   rP   _check_pipeline_persist_varV  s,   







z-PipelineOptimizer._check_pipeline_persist_varc                 C   sR  |j }|| _|j}|d u rt }|j}|sJ dg d}|D ]}	|	|v s,J d|	q|d | _|d | _|d | _|d | _	|d | _
|d	 | _|d
 | _|d | _|dd| _| jdkscJ d| j  krp| jk ssJ  J | j||||\}
}| jj| _| |\| _| _| | | |}dd }t|t|d}||ksJ d| | |j}| ||}|D ]
}| | | qt dd rt!t d| _| jt"|k sJ dn	|  jt"|;  _| #|| j  | $|| g }|D ]*}t!|%dd }t&' r|(t&)|d  qt&* r$|(t&+|d  q| ,|| j}d|i|_|| j  }| jsA| -| | j	sW| .| |/  | 0| |/  t&' ret!t dd}nt&* rrt!t dd}| 1|| j  | 2|| j  dd| jt"|| jt"||| j || j |d| j3| j4d|_|
||| j5| j6fS )NzPlease use pipeline with fleet.)
local_rankr  r  rm  rs  r  r  r  z)Please use pipeline with fleet to use {}.r"  r  r  r  rm  rs  r  r  scale_gradientFr   r   c                 S   s@   t | dd }t |dd }||k rdS ||krdS dS )Nr  r   rI   r   )r   r  )Zdevice1Zdevice2Zdev1_idZdev2_idrO   rO   rP   
device_cmp  s   z.PipelineOptimizer.minimize.<locals>.device_cmpr'  z`With pipeline parallelism, you must use gpu devices one after another in the order of their ids.ZPADDLE_MANUAL_PIPELINE_STAGEzTManually specified pipeline stage must be less than total number of pipeline stages.r  r  ZFLAGS_selected_gpus0ZFLAGS_selected_npusZPipelineTrainerZSectionrI   )ZtrainerZdevice_workerZpipeline_stageZnum_pipeline_stagesr  Zinner_parallelismZsection_programr   place_idZ
sync_stepsri  rj  )7r   Zorigin_main_blockr   r
   Z_pipeline_optr   r"  r  r  r  rm  rs  r  r  r   r#  r]  r4  r^  ro   r  rg  rh  r  r  r(  r$   r  r  r  r   osgetenvr   r   r  r  r  r!   ri  r   Z	CUDAPlacerY  ZNPUPlacer  r  r  r  r  r   r!  r_  r`  re  rf  )rq   r  r  rs   r  r  r  Zpipeline_optrequired_keysr   r)  r  r  r$  Zsorted_device_listr  r   Z
place_listdevZ	dev_indexr  Z
real_blockr&  rO   rO   rP   r4  s  s   









 






	






zPipelineOptimizer.minimize)r   r   rM   )FNNr6  ).rk   r7  r8  r9  ry   rw  r  r  r  r  r  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r  r  r  r  r  r  r  r  r  r  r   r!  r4  rO   rO   rO   rP   rE     sn    
8+2S&
g3  
  R
;(0rE   c                   @   s   e Zd ZdZdd Zdd Zdd Zejdd	 Z	d
d Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd1d)d*Z	(	(	(	(d2d+d,Zd-d. Z	(	(	(d3d/d0Zd(S )4rG   a	  
	:api_attr: Static Graph

    Recompute Optimizer Wrapper

    Normally, a training step contains three sub-steps: first, run forward
    Operators to calculate the loss; second, run backward Operators to 
    calculate gradient of the parameters; third, apply optimization method
    to update the value of the parameters.

    In the forward computation process, all variables that are needed by 
    backward computation process will be kept in memory, which occupy a great
    amount of memory when the network becomes very deep.

    Recompute split the network to k segments. In each segment, It will 
    recompute the forward Operators, before running backward operators. It is
    very helpful for saving memory.
 
    The Variables that separate a network to segments are called as checkpoints,
    and users should set it manually. The usage is very simple:

    Args:
        optimizer (Optimizer): The optimizer that is applied to parameters.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            import numpy as np
            def gen_data():
                return {"x": np.random.random(size=(32, 32)).astype('float32'),
                "y": np.random.randint(2, size=(32, 1)).astype('int64')}
            def mlp(input_x, input_y, hid_dim=128, label_dim=2):
                print(input_x)
                fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
                prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
                cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
                sum_cost = fluid.layers.reduce_mean(cost)
                return sum_cost, fc_1, prediction
            input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
            input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
            cost, fc_1, pred = mlp(input_x, input_y)

            sgd = fluid.optimizer.Adam(learning_rate=0.01)
            sgd = fluid.optimizer.RecomputeOptimizer(sgd)
            sgd._set_checkpoints([fc_1, pred])
            sgd.minimize(cost)

            print("Finished optimize")
            place = fluid.CPUPlace()
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())
            step = 10

            for i in range(step):
                cost_val = exe.run(feed=gen_data(),
                       program=fluid.default_main_program(),
                       fetch_list=[cost.name])
                print("step=%d cost=%f" % (i, cost_val[0]))

    c                 C   s:   t  rtd|| _d | _| jj| _| jj| _d| _d S )Nz-In dygraph, don't support RecomputeOptimizer.F)r   rV   r   r]  _checkpointsrb   rg   enable_offload)rq   r   rO   rO   rP   ry   E  s   


zRecomputeOptimizer.__init__c                 C   sD   t |ts	J d|D ]}t |tjst |tsJ dq|| _dS )zV
        Args:
            checkpoints (list): List of Variable or string    
        z=_checkpoints should be a list of Variable or a list of StringN)rW   rS   sixstring_typesr   r+  )rq   checkpointsckptrO   rO   rP   _set_checkpointsN  s   

z#RecomputeOptimizer._set_checkpointsc                 C   s
   d| _ d S )NT)r,  r   rO   rO   rP   _enable_offload^     
z"RecomputeOptimizer._enable_offloadc                 C   s   t d)a  
	    :api_attr: Static Graph

        load function is not supported by Recompute Optimizer for now.
        :return: None

        Args:
            state_dict: the dict load by load_persistable method

        Examples:
            .. code-block:: python

                import paddle.fluid as fluid
                import paddle.compat as cpt
                
                def mlp(input_x, input_y, hid_dim=128, label_dim=2):
                    fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
                    prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
                    cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
                    sum_cost = fluid.layers.reduce_mean(cost)
                    return sum_cost, fc_1, prediction
                
                input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
                input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
                cost, fc_1, pred = mlp(input_x, input_y)
                print("Finished FF")
                
                sgd = fluid.optimizer.Adam(learning_rate=0.01)
                sgd = fluid.optimizer.RecomputeOptimizer(sgd)
                sgd._set_checkpoints([fc_1, pred])
                try:
                    state_dict = {}
                    sgd.load(state_dict)
                except NotImplementedError as e:
                    print(cpt.get_exception_message(e))
        z=load function is not supported by Recompute Optimizer for nowr   )rq   r   rO   rO   rP   loada  s   &zRecomputeOptimizer.loadc                 C   s   | j j|dS )a  
        call apply_gradients function of self._optimizer.

        Args:
            params_grads (list): list of (param, grad) pair to do optimization.

        Returns:
            list: A list of operators appended to the current program.

        Examples:
            .. code-block:: python

                import paddle.fluid as fluid
                import paddle.fluid.framework as framework

                def mlp(input_x, input_y, hid_dim=128, label_dim=2):
                    fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
                    prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
                    cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
                    sum_cost = fluid.layers.reduce_mean(cost)
                    return sum_cost, fc_1, prediction


                input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
                input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
                cost, fc_1, pred = mlp(input_x, input_y)
                print("Finished FF")

                sgd = fluid.optimizer.Adam(learning_rate=0.01)
                sgd = fluid.optimizer.RecomputeOptimizer(sgd)
                sgd._set_checkpoints([fc_1, pred])
                params_grads = sgd.backward(
                    cost,
                    startup_program=None,
                    parameter_list=None,
                    no_grad_set=None)

                program = cost.block.program
                with framework.program_guard(program, None):
                    optimize_ops = sgd.apply_gradients(params_grads)

                print("Finished apply gradients")
        )r  )r]  r*  )rq   r  rO   rO   rP   r*    s   -z"RecomputeOptimizer.apply_gradientsc                 C   sx   t |d }t |d }| j j|| j| j |jddd}| j j|| j| j |jddd}||fS )Nz@Pinnedr  FTrw   r   rf   r   r   )r   r   _main_programr   r  checkpoint_shaper   rf   )rq   varnamepinned_var_nameZfetched_var_name
pinned_varZ	fetch_varrO   rO   rP   _creat_vars  s"   

zRecomputeOptimizer._creat_varsc           	      C   s   d}|  }| j }tj }|D ]3}| j  |}|j|| j	| j  |j
jddd}|jdd|id|jd|jd	d
dd||id qdS )a2  
        add fill_constant_ops to the end of the prog

        we should fill the pinned vars before runing the main_prog
        to instantiate their tensor hold_, which could tell us whether 
        the host memory could hold all the checkpoints from all the 
        GPU devices in this node. 
        r   FTr5  r   r   r   rf   r   r   Z
place_typer&   rZ   r   r   N)r   checkpoint_name2pinned_namevaluesr!   r   r  r6  r   r  r7  rw   rf   r   r   )	rq   r  r  r   Zfill_constant_varsOP_ROLE_KEYr8  r   r:  rO   rO   rP   _append_fill_constant_ops  s.   	


z,RecomputeOptimizer._append_fill_constant_opsc              	   C   sT   t j }| jj|dd| j |gid| j |gidt|||id d S )Nr  r
  r   dst_place_typer  )	r!   r   r  r   r  r6  r   r   r   )rq   Z
insert_idxZsrc_varnameZdst_varnamer  rA  r?  rO   rO   rP   _insert_async_memcpy_op  s   

z*RecomputeOptimizer._insert_async_memcpy_opc                 C   sB   || j v sJ d|| j | }| j| }| |||dd d S )Nz>Try to fetch {} from Pinned Memory, but it is NOT a checkpointr   )r=  r   checkpoint_name2fetch_namerB  )rq   r   r8  pinned_varnameZfetch_varnamerO   rO   rP   _insert_fetch_op  s   

z#RecomputeOptimizer._insert_fetch_opc                 C   s8   || j v sJ d|| j | }| |||dd d S )Nz>Try to offload {} to Pinned Memory, but it is NOT a checkpointr   r&   )r=  r   rB  )rq   r   r8  rD  rO   rO   rP   _insert_offload_op  s
   
z%RecomputeOptimizer._insert_offload_opc                 C      d S rM   rO   )rq   rt  checkpoint_namerO   rO   rP   _insert_sync_op     z"RecomputeOptimizer._insert_sync_opc                 C   sD   t | jdksJ d| jd}td| d|f| j|< |S )Nr   z#Could NOT found checkpoint to fetchrI   zRecord fetch [{}]fetch)r   un_fetch_checkpoint_namesr  r]   debugr   idx2insertionsrq   r   rH  rO   rO   rP   _record_fetch_op  s   z#RecomputeOptimizer._record_fetch_opc                 C   sF   | j d}||ksJ d||td| d|f| j|< d S )Nr   z%expected to offload [{}] but got [{}]zRecord offload [{}]offload)un_offload_checkpoint_namesr  r   r]   rM  rN  )rq   r   rH  Zexpected_checkpoint_namerO   rO   rP   _record_offload_op  s   z%RecomputeOptimizer._record_offload_opc                 C   sF   || j vsJ d|| j | td| d|f| j|< d S )Nz%Try to sync the checkpoint [{}] twicezRecord offload sync [{}]r  )synced_checkpointsr   r  r]   rM  rN  rO  rO   rO   rP   _record_sync_op   s   z"RecomputeOptimizer._record_sync_opc                 C   s  i | _ | jd d  | _| jd | jd d  }i | _| jD ]}d| j|< qt| jj| _t	| jjD ]\}}t
|jddkrF|| _ nq3| jt| jjk sTJ d| | j}d }t	| jj| jd  D ][\}}| j| }|j }|D ]J}	|	|v r|	| jvr| j|	 dkr|}
|	| jd kr| |}|
|	ksJ d|
|	| jj| |	| j|	  | j|	  d7  < qwtd|	qwqgt| jdksJ d| jd S )	NrI   r   r  r   z#Could NOT found backword op in progz6Current recompute segment should use [{}] BUT got [{}]z&use checkpoint [{}] before fetch in BW%{} checkpoints have NOT been Recorded)rN  sorted_checkpoint_namesrL  r  Zcheckpoint_usage_countr   r   r   bw_strart_op_idxr  r   ro  r   rP  r   r   r  rC  r  )rq   Zneed_fetch_checkpoint_namesrH  r   r   Zfetched_checkpoint_varnameZlast_last_fetch_checkpointr  
input_vars	input_varZsecond_to_last_fetch_checkpointrO   rO   rP   _parse_backward'  sp   



z"RecomputeOptimizer._parse_backwardc                 C   s   t | jdkr	d S t | jj}tt| j|D ]7}|| jv rN| j| \}}|dkr<| || t	d
| | j|= q|dkrN| || t	d
| q| j  t | jdksjJ d
dd | j D d S )	Nr   rK  zInsert [{}] fetch op.r  zSync [{}] fetch op.z{} checkpoints left un-Fecthedc                 S      g | ]}|d  qS r   rO   r   ZelerO   rO   rP   r   s      z7RecomputeOptimizer._update_backward.<locals>.<listcomp>)r   rN  r   r   r  r  rX  rE  r]   rM  r   rI  r  r>  )rq   Ztotal_oprt  	operationrH  rO   rO   rP   _update_backwarda  s4   


z#RecomputeOptimizer._update_backwardc                 C   s  i | _ | jd d  | _| jd}| jd d  }i | _| jD ]
}ddd| j|< qt | _t| jj	| _
t| jj	D ]\}}t|jddkrM|| _
 nq:| j
t| jj	k s[J dd }t| jj	| j
| j D ]\}}| j
| }|j }|j }	|D ]}
|
|v rt|dksJ d|
||
| jv r|d kr| j| d dkr| || n| j| d	 }|dksJ d
|| |d | | |d |
 |
}ntd|
|
|kr)t|dksJ d|
||| jd ksJ d|| jd || j| d	 dkr| || q~| j| d	 }|dks!J d
|| |d | q~|	D ]'}||v rR|| jvs@J d|| j| d  d7  < || j| d	< q,qit| jdkseJ d| jt| jt|ks}J dt|t| j d S )NrI   r   )countr   r  z"Could NOT found Forward op in progr   zJchekpoint should be the only Output of a certain op, but [{}] is from [{}]ra  r   z5last_usage_idx of checkpoint [{}] should large than 0z7There should be just ONE op that output checkpoint [{}]zJthe last offload chekpoint before [{}] is suppose to be [{}], but got [{}]zcheckpoint [{}] used after syncrV  )rN  rW  rR  r  Zcheckpoint_usage_count_and_idxr   rT  r   r   r   fw_strart_op_idxr  r   ro  r   rX  rp  r   r   rU  rS  r  rL  )rq   Zlast_checkpointZneed_offload_checkpoint_namesrH  r   r   Zlast_offload_checkpointr  Zoutput_varsrY  r  Zlast_usage_idxrZ  rO   rO   rP   _parse_forwardu  s  











z!RecomputeOptimizer._parse_forwardc                 C   s   t | jdkr	d S tt| j| jD ];}|| jv rM| j| \}}|dkr7| || td	| | j|= q|dkrM| 
|| td	| | j|= q| j  t | jdksiJ d	dd | j D d S )	Nr   rQ  zInsert [{}] offload op.r  zInsert [{}] offload_sync op.z {} checkpoints left un-Offloadedc                 S   r\  r   rO   r]  rO   rO   rP   r     r^  z6RecomputeOptimizer._update_forward.<locals>.<listcomp>)r   rN  r  r  rc  rX  rF  r]   rM  r   rI  r   r  r>  )rq   rt  r_  rH  rO   rO   rP   _update_forward  s8   


z"RecomputeOptimizer._update_forwardc                 C   rG  rM   rO   r   rO   rO   rP   _check_offload_fetch  rJ  z'RecomputeOptimizer._check_offload_fetchNc                 C   s  |j j| _|j | _ |dkrtj }t| j|b t| jdks(J d	| jt
dd | jD s:J d	| jt | _t | _| jD ]}| |\}}|| j|< || j|< qE| | |   |   |   |   |   W d   dS 1 s}w   Y  dS )z
        core steps for recompute offload
        1. create pinned vars and temp vars 
        2. parse & update Forward pass: offload, sync
        3. parse & update Backward pass: rename, fetch, sync
        4. verify the correctness
        Nr   zFcheckpoints shape {} should be an non empty list like: [12, 512, 1024]c                 S   s   g | ]}|d kqS r   rO   r]  rO   rO   rP   r     r^  z/RecomputeOptimizer._offload.<locals>.<listcomp>zLall ele in checkpoints shape {} should be a determined integer larger than 0)r   r   r6  rZ  Zstaticr
   r   r   r7  r   allrN   r=  rC  rW  r;  r@  r[  r`  rd  re  rf  )rq   r  r  Zcheckpoint_varnamer9  Zfetch_var_namerO   rO   rP   _offload  sJ   





zRecomputeOptimizer._offloadc                 C   s   | j dus	J dt rtd|j| _|jj}t||= g }| j D ]}t	|t
r1|| q$||j| q$t|dkrLt||||d\}	}
nt||||d}	W d   n1 s^w   Y  | jrp|
| _| j||d |	S )a8  
        call append_backward with checkpoints.

        Args:
            loss (Variable): loss variable to run optimizations.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            parameter_list (list): list of Variables or Variable.names to update.
            no_grad_set (set|None): set of Variables or Variables.names should be ignored.
            callbacks (list|None): list of callables to run when appending backward
                operator for one parameter.
            checkpoints (list): list of Variables as checkpoints

        Examples:
            .. code-block:: python

                import paddle.fluid as fluid
    
                def mlp(input_x, input_y, hid_dim=128, label_dim=2):
                    fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
                    prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
                    cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
                    sum_cost = fluid.layers.reduce_mean(cost)
                    return sum_cost, fc_1, prediction
    
    
                input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
                input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
                cost, fc_1, pred = mlp(input_x, input_y)
                print("Finished FF")
    
                sgd = fluid.optimizer.Adam(learning_rate=0.01)
                sgd = fluid.optimizer.RecomputeOptimizer(sgd)
                sgd._set_checkpoints([fc_1, pred])
                params_grads = sgd.backward(
                    cost,
                    startup_program=None,
                    parameter_list=None,
                    no_grad_set=None)
                print("Finished backward")
        N&You should call _set_checkpoints first*DyGraph current does not support recomputer   )r/  r  )r+  r   rV   r   rf   re   r   r   r   rW   r   r   r   r   r   r,  rW  rh  )rq   r  r  rs   r  r  r   Zcheckpoint_varsr0  r  rW  rO   rO   rP   r    sF   /

zRecomputeOptimizer.backwardc                 C   s*   t | jdr
| jjn| jj}||||dS )a  
        call the apply_optimize function of self._optimizer
        Args:
            loss (Variable): loss variable to run optimizations.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            params_grads (list): list of (param, grad) pair to do optimization.
        Examples:
            .. code-block:: python
                import paddle.fluid as fluid
                
                def mlp(input_x, input_y, hid_dim=128, label_dim=2):
                    fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
                    prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
                    cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
                    sum_cost = fluid.layers.reduce_mean(cost)
                    return sum_cost, fc_1, prediction                
                
                input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
                input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
                cost, fc_1, pred = mlp(input_x, input_y)
                print("Finished FF")
                
                sgd = fluid.optimizer.Adam(learning_rate=0.01)
                sgd = fluid.optimizer.RecomputeOptimizer(sgd)
                sgd._set_checkpoints([fc_1, pred])
                params_grads = sgd.backward(
                    cost,
                    startup_program=None,
                    parameter_list=None,
                    no_grad_set=None)
                
                optimize_ops = sgd.apply_optimize(
                    cost, startup_program=None, params_grads=params_grads)
                
                print("Finished apply_optimize")
        r+  r2  )r  r]  r+  Z_apply_optimize)rq   r  r  r  funcrO   rO   rP   r+  m  s   'z!RecomputeOptimizer.apply_optimizec                 C   s^   t |ts	J d| jd usJ dt rtd| j||||d}| j|||d}||fS )Nr0  ri  rj  r1  r2  )rW   r   r+  r   rV   r   r  r+  r3  rO   rO   rP   r4    s(   zRecomputeOptimizer.minimizerM   r5  r6  )rk   r7  r8  r9  ry   r1  r2  r   Zdeprecate_stat_dictr4  r*  r;  r@  rB  rE  rF  rI  rP  rS  rU  r[  r`  rd  re  rf  rh  r  r+  r4  rO   rO   rO   rP   rG     sB    >	
(/"	:`
,
S0rG   c                   @   s$   e Zd ZdZd	ddZd
ddZdS )rF   a  
	:api_attr: Static Graph

    This implements the Lookahead optimizer of the
    paper : https://arxiv.org/abs/1907.08610.

    Lookahead keeps two sets of params: the fast_params and
    the slow_params. inner_optimizer update fast_params every 
    training step. Lookahead updates the slow_params and fast_params 
    every k training steps as follows:

    .. math::
        
        slow\_param_t &= slow\_param_{t-1} + \\alpha * (fast\_param_{t-1} - slow\_param_{t-1})
	
	fast\_param_t &=  slow\_param_t

    Args:
        inner_optimizer (Optimizer): The optimizer that update fast params step by step. 
        alpha (float): The learning rate of Lookahead.
        k (int): The slow params is updated every k steps.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.fluid as fluid
            import numpy as np
            import numpy.random as random

            paddle.enable_static()
        
            x = fluid.layers.data(name='x', shape=[2], dtype='float32')
            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
            y = fluid.layers.fc(input=[x], size=2, act="softmax")
            loss = fluid.layers.cross_entropy(input=y, label=label)
            loss = paddle.mean(x=loss)
            sgd = fluid.optimizer.SGD(learning_rate=0.01)
            optimizer = fluid.optimizer.LookaheadOptimizer(sgd,
                                                alpha=0.5,
                                                k=5)
            optimizer.minimize(loss)
            main_program = fluid.default_main_program()
            place = fluid.CPUPlace()
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())

            def train_reader(limit=5):
                for i in range(limit):
                    yield random.random([2]).astype('float32'), random.random([1]).astype('int64')
            
            feeder = fluid.DataFeeder(feed_list=[x, label], place=place)
            reader = paddle.batch(paddle.reader.shuffle(train_reader, buf_size=50000),batch_size=1)
            
            for batch_data in reader():
                exe.run(fluid.default_main_program(),
                feed=feeder.feed(batch_data))

          ?   c                 C   sz   t  rtd|d usJ dd|  krdks"J d J dt|tr+|dks/J d|| _|| _|| _d| _d S )	Nz-In dygraph, don't support LookaheadOptimizer.inner optimizer can not be Noner   r   zBalpha should be larger or equal to 0.0, and less or equal than 1.0r   zk should be a positive integerZ	lookahead)	r   rV   r   rW   r   inner_optimizeralphar   rZ   )rq   rp  rq  r   rO   rO   rP   ry     s   
zLookaheadOptimizer.__init__Nc                 C   s  | j j||d}|d u rt }|j}dd | D }i }|D ]}||}|d us,J |j|d |j|jdd}	|	||< q|	 }
|D ]&}|
|}|d usQJ |
j|d |j|jdd}	|
j
dd|id	|	id
 qDt|j| tjddgt| jddd}tjddgt| jddd}tjddgtdddd}tj|ddd tjdgddd}tjdgddd}t||}tj }|||k |D ]}||}|| }	tj||	d qW d    n1 sw   Y  |||k6 |D ]+}||}|| }	tt||t|	t||}tj||	d tj||d qW d    n	1 s/w   Y  |  W d    n	1 sDw   Y  W d    n1 sTw   Y  W d    |S W d    |S 1 smw   Y  |S )Nrk  c                 S   s   g | ]}|j qS rO   r#  r,  rO   rO   rP   r   
  s    z/LookaheadOptimizer.minimize.<locals>.<listcomp>z@SLOWTrw   r   rf   r   r7  r
  r   r   Zlookahead_kr   r|   r   Zlookahead_alphar   Zlookahead_stepr   r   r%  r   Zin_placer   r   rf   r   r3  )rp  r4  r
   r   r-  r   r  r   rf   r   r   r   r   r   r   r   r   r   rX   rq  r  r   elementwise_modrN  rO  rP  r7  elementwise_addr  Zelementwise_subrQ  )rq   r  r  Zmini_outr  r	  Zparam_to_slowrx   Zfast_varZslow_varstartup_blockr   rq  r   zero_varZone_varmodrR  r   Ztmp_varrO   rO   rP   r4    s   









 
4
44zLookaheadOptimizer.minimize)rm  rn  rM   )rk   r7  r8  r9  ry   r4  rO   rO   rO   rP   rF     s    
<rF   c                   @   s   e Zd ZdZdZdddZdd Zd	d
 Z				dddZdd Z	dd Z
dd Zdd Zdd Zdd Z			dddZdS )GradientMergeOptimizera  
    Gradient Merge, also called as Gradient Accumulation,
    is a training strategy for larger batches. With this strategy,
    the parameter will not be updated until specific steps.

    For each step, the forward network and the backward network
    will run to calculate the gradient of the parameters.

    For every k step, the optimization network will run,
    applying a specific optimization method (such as SGD, Adam)
    to the parameters.

    Args:
        inner_optimizer (Optimizer): The specific optimization (such as SGD, Adam)
            which update the parameters
        k_steps (int): the update period of the parameters
        avg (bool): whether to average the gradients of each mini-batch,
            the default value is `True`

    Examples:
        .. code-block:: python

        import paddle.fluid as fluid
        import numpy as np

        def gen_data(batch_size):
            return {"x": np.random.random(size=(batch_size, 32)).astype('float32'),
                    "y": np.random.random(size=(batch_size, 1)).astype('int64')}

        def mlp(input_x, input_y, hid_dim=128, label_dim=2):
            fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
            prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
            cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
            sum_cost = fluid.layers.reduce_mean(cost)
            return sum_cost, fc_1, prediction

        input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
        input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
        cost, fc_1, pred = mlp(input_x, input_y)
        sgd = fluid.optimizer.Adam(learning_rate=0.01)
        sgd = fluid.optimizer.GradientMergeOptimizer(sgd, k_steps=4, avg=True)
        sgd.minimize(cost)

        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())

        for i in range(10):
            cost_val = exe.run(feed=gen_data(32),
                       program=fluid.default_main_program(),
                       fetch_list=[cost.name])
            print("step=%d, cost=%f" % (i, cost_val[0]))
    Zgrad_merge_cond_namer   Tc                 C   s\   t  rtd|d usJ dt|tr|dksJ d|| _|| _d| _|| _d | _	d S )NzIn dygraph, we don't support GradientMergeOptimizer.You can do Gradient merge by yourself with k-times forward + backward, and one-time optimizer.minimize()ro  r   z$k_steps should be a positive integerZgradient_merge)
r   rV   r   rW   r   rp  k_stepsrZ   avg_optimize_ops)rq   rp  r{  r|  rO   rO   rP   ry     s   

zGradientMergeOptimizer.__init__c                 C   
   || _ d S rM   )r{  )rq   r{  rO   rO   rP   _set_k_steps  r3  z#GradientMergeOptimizer._set_k_stepsc                 C   r~  rM   )r|  )rq   r|  rO   rO   rP   _set_avg  r3  zGradientMergeOptimizer._set_avgNc                 C   sF   t |ts	J d|d u sJ d|d u sJ d| jj||d}|S )Nr0  zCThe parameter_list should be None when using GradientMergeOptimizerz@The no_grad_set should be None when using GradientMergeOptimizerrk  )rW   r   rp  r  )rq   r  r  rs   r  r  r  rO   rO   rP   r    s   

zGradientMergeOptimizer.backwardc                 C   sB   |j j}t|| | |}W d    |S 1 sw   Y  |S rM   )r   r   r   r*  )rq   r  r  r  r   r)  rO   rO   rP   r+    s   
z%GradientMergeOptimizer.apply_optimizec                 C   r  r  r  r  rO   rO   rP   r    r  z*GradientMergeOptimizer._is_the_backward_opc                 C   s   t j}|j}| |sJ d||j|j}| |  }|j|v s-J d|j||j|v s;J d|j||	|j |	|j t
|dkrW|| | d S ||  d S )Nz<grad.op={} is not the backward op which produces the grad={}zBwhen using GradientMergeOptimizer, param={} must be in var_attr={}zAwhen using GradientMergeOptimizer, grad={} must be in var_attr={}r   )r!   r   r   r  r   rw   r   r  r  r  r   r  r  )rq   rx   r  r  r   r   r  rO   rO   rP   _remove_op_role_var  s*   


z*GradientMergeOptimizer._remove_op_role_varc                 C   sP   ||_ tj}|jj}|| j|j || | ||	 |j|jg d S rM   )
r   r!   r   r  r  r  GRAD_MERGE_COND_NAMErw   r  r  )rq   r   rx   r  condr  r  rO   rO   rP   _add_gm_op_role_var  s   z*GradientMergeOptimizer._add_gm_op_role_varc                 C   s   t jddgt| jdddd}t jddgtddddd}t jddgtddddd}|jd	dgd
d}td/ t j|ddd |jd||dd|idddd |jd||dd|id W d    |S 1 sjw   Y  |S )NZgradient_merge_kr   r|   T)rw   r   r   rf   r   r  Zgradient_merge_zeror   Zgradient_merge_stepZgradient_merge_condrW  rl  r   r   rs  ru  r
  r  r   rI   FZaxisZ
use_mkldnnr  equalr   )r   r   r   r{  r  r   r  r   )rq   r  Z
k_step_varrx  Zstep_varZcond_varrO   rO   rP   _get_gm_cond_var  s\   

z'GradientMergeOptimizer._get_gm_cond_varc                    sf  t   t }  }| }|}|D ]\}}|jtjjjks%J d	|| qdd |D }|
 }	i }
g |D ]b\}}|j}||}|d usOJ |j|d |j|jdd}||
|< |j|d |j|jdd}|jdd|i|j|jtd	d
d |jd||dd|idddd}|||| ||g q= fdd}tj||d d jS )Nz@SELECTED_ROWS is not supported in GradientMergeOptimizer for nowc                 S   s   i | ]\}}|j |qS rO   r#  )r   r   r   rO   rO   rP   
<dictcomp>6  r  z:GradientMergeOptimizer.apply_gradients.<locals>.<dictcomp>z@GRAD@GradientMergeTrr  r   r   r   rt  r<  rv  r  rI   Fr  r  c               	      s    j }   }||  tj}jr8D ]#\}}|jdd|id|idj dddd |j	|
 |jj qD ]\}}||_q:j_D ]\}}tj|j|jd|d	 |j	|
 |jj qKd S )
Nr  r
  r   r   r   F)r  ZbiasZbias_after_scaler  )r   rf   r   r}   )Zcurrent_block_idxr   Z_set_forward_block_idxr!   r   r|  r   r{  r   r  r  r  r  r   rp  r*  r}  r   r   r   rf   rr  )Zcur_block_idxZ	cur_blockr  rx   r  r  Znew_params_gradsrq   rO   rP   true_apply_gradiente  s@   
zCGradientMergeOptimizer.apply_gradients.<locals>.true_apply_gradient)Ztrue_fnZfalse_fn)r	   r
   r   r  rZ   r!   r   r   r  r  r  rw   r   r  r   rf   r   rX   r  r   r   r  r}  )rq   r  r  r  rw  r  rx   r  Zparam_to_gradZparam_namesZparam_to_gradient_merger   r~  Zgradient_merge_varZstartup_gradient_merge_varZnew_grad_opr  rO   r  rP   r*  %  sp   

	
*z&GradientMergeOptimizer.apply_gradientsc                 C   s<   t |ts	J d| j||||d}| j|||d}||fS )Nr0  r1  r2  )rW   r   r  r+  r3  rO   rO   rP   r4    s   zGradientMergeOptimizer.minimize)r   Tr5  r6  )rk   r7  r8  r9  r  ry   r  r  r  r+  r  r  r  r  r*  r4  rO   rO   rO   rP   rz  Z  s*    6

4prz  )d
__future__r   r   r   r-  r'  r]   collectionsr   rZ  Z$paddle.fluid.distribute_lookup_tabler   Zpaddle.fluid.frameworkr   r   r   r   r	   r
   r   rD  r   r   r   r  r   r   r   r   r  r   r   r   r   r   r   r   r   Zlayer_helperr   r   Zdygraphr   r   r   Zdygraph.learning_rate_schedulerr   r    Zpaddle.fluidr!   Zpaddle.fluid.layersr"   	functoolsr#   r$   wrapped_decoratorr%   r'   Zcptr  r(   r)   Zfluid.frameworkr*   r+   r,   __all__objectrH   r5   r6   re  rB   r7   r8   r9   r:   r;   r?   r<   r=   rC   r-   r.   r/   r0   r1   r2   r3   r>   ZRMSPropr4   rA   ZLambr@   rD   rE   rG   rF   rz  rO   rO   rO   rP   <module>   s   $         4     0 Y   r @hz 
 M . .  > w                   1 )