o
    Me                     @   s  d dl mZ d dlZd dlZd dlZd dlZd dlZddlmZ ddlm	Z	 ddlm
Z
 ddlmZ ddlmZ dd	lmZ dd
l	mZmZmZ ddlmZ ddl	mZ d dlmZmZ g dZdadd Zdd Zdd ZG dd deZG dd deZ dd Z!G dd deZ"G dd de"Z#G d d! d!e"Z$da%d"d# Z&G d$d% d%e"Z'e	j(d,d&d'Z)d(d) Z*d*d+ Z+e"Z,e#Z-e$Z.e'Z/dS )-    )print_functionN   )layers)	framework)core)
name_scope)base)check_variable_and_dtype)_non_static_modein_dygraph_mode_in_legacy_dygraph)LayerHelper)default_main_program)_C_ops_legacy_C_ops)set_gradient_clipErrorClipByValueClipGradByValueClipGradByNormClipGradByGlobalNormFc                  G   sB   t | dksJ t | dkrt| d tsJ t}| d a|S tS )Nr   r   )len
isinstancebool'_clip_by_global_norm_using_mp_type_flagargs	old_value r   AD:\Projects\ConvertPro\env\Lib\site-packages\paddle/fluid/clip.py"_clip_by_global_norm_using_mp_type*   s   r   c                 C   s:   | j tjjjks| j tjjjkrt r| tjjjS | S N)	dtyper   VarDescVarTypeFP16BF16r   astypeFP32xr   r   r   _cast_to_mp_type_if_enabled6   s   r*   c                 C   s   t | } t s| jtjjjks| jtjjjkr$t	| }t
|}|S t r,t| S t r4t| S d}t| dddg| t|fi t }|| j}d| i}d|i}|j|||d |S )z:
    This OP returns the squared L2 norm of a tensor.
    squared_l2_normr)   float32float64XOuttypeinputsoutputs)r*   r   Zis_compiled_with_xpur!   r"   r#   r$   r%   r   squareZ
reduce_sumr   r   r+   r   r   r	   r   localsZ"create_variable_for_type_inference	append_op)r)   r4   
sum_squareZop_typehelperoutr2   r3   r   r   r   _squared_l2_norm?   s$    



r:   c                   @   s   e Zd Zdd Zdd ZdS )BaseErrorClipAttrc                 C      t  r    NotImplementedErrorselfr   r   r   __str__]      zBaseErrorClipAttr.__str__c                 C   r<   r    r=   )r@   block	grad_namer   r   r   _append_clip_op`   rB   z!BaseErrorClipAttr._append_clip_opN)__name__
__module____qualname__rA   rE   r   r   r   r   r;   [   s    r;   c                   @   s*   e Zd ZdZd	ddZdd Zdd ZdS )
r   a  
    Clips tensor values to the range [min, max].

    Given a tensor ``t`` (see Examples below), this operation clips its value \
    to ``min`` and ``max`` inplace.

    - Any values less than min are set to min.
    - Any values greater than max are set to max.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. if not set by user, \
        will be set to ``-max`` by framework.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            BATCH_SIZE = 128
            CLIP_MAX = 2e-6
            CLIP_MIN = -1e-6
            prog = fluid.framework.Program()
            with fluid.program_guard(main_program=prog):
                image = fluid.layers.data(
                    name='x', shape=[784], dtype='float32')
                hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
                hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
                predict = fluid.layers.fc(
                    input=hidden2, size=10, act='softmax')
                label = fluid.layers.data(name='y', shape=[1], dtype='int64')
                cost = fluid.layers.cross_entropy(input=predict, label=label)
                avg_cost = fluid.layers.mean(cost)
            prog_clip = prog.clone()
            prog_clip.block(0).var(hidden1.name)._set_error_clip(
                fluid.clip.ErrorClipByValue(
                    max=CLIP_MAX, min=CLIP_MIN)
    Nc                 C   s0   t |}|d u r| }nt |}|| _|| _d S r    )floatmaxminr@   rJ   rK   r   r   r   __init__   s   
zErrorClipByValue.__init__c                 C      d| j | jf S )NzByValue, min=%f, max=%frK   rJ   r?   r   r   r   rA         zErrorClipByValue.__str__c                 C   sP   |j  }|d |d|g |d|g |d| j |d| j d S )Nclipr.   r/   rK   rJ   )descr6   set_typeZ	set_inputZ
set_output	_set_attrrK   rJ   )r@   rC   rD   Zclip_op_descr   r   r   rE      s   

z ErrorClipByValue._append_clip_opr    )rF   rG   rH   __doc__rM   rA   rE   r   r   r   r   r   d   s
    
&	r   c                    s   | | j | j  d } fdd| D D ]&}|  | }t|dd }|d u s4t|ts4td|d ur>|	| | qd S )Nr   c                    s   g | ]}| v r|qS r   r   ).0nZgrad_to_varr   r   
<listcomp>   s    z'error_clip_callback.<locals>.<listcomp>
error_clipzIVariable's error_clip should be an instance of BaseErrorClipAttr or None.)
rR   opZop_sizeZoutput_arg_namesZ_var_recursivegetattrr   r;   	TypeErrorrE   )rC   contextZop_descZgrad_nZfwd_varrZ   r   rX   r   error_clip_callback   s   r_   c                       sR   e Zd Z fddZdd Zejdd Zdd Zd	d
 Z	dd Z
dd Z  ZS )ClipGradBasec                    s   t t|   d S r    )superr`   rM   r?   	__class__r   r   rM      s   zClipGradBase.__init__c                 C   r<   r    r=   r?   r   r   r   rA      rB   zClipGradBase.__str__c                 C      t r    r=   r@   params_gradsr   r   r   _dygraph_clip   s   zClipGradBase._dygraph_clipc                 C   rd   r    r=   re   r   r   r   _static_clip      zClipGradBase._static_clipc                 C   sH   t  r	| |S |D ]\}}t|dd d urtd  nq| |S )Ngradient_clip_attrz'set_gradient_clip' will be ineffective, because you have set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' is redundant and you can remove it.)r   r
   rg   r\   warningswarnrh   )r@   rf   pgr   r   r   __call__   s   

zClipGradBase.__call__c                 C   r<   r    r=   r@   r^   paramgradr   r   r   _process_context   rB   zClipGradBase._process_contextc                 C   r<   r    r=   )r@   rq   rr   r   r   r   _create_operators   rB   zClipGradBase._create_operators)rF   rG   rH   rM   rA   imperative_baseno_gradrg   rh   ro   rs   rt   __classcell__r   r   rb   r   r`      s    
r`   c                       sP   e Zd ZdZd fdd	Zdd Zejdd Zd	d
 Z	dd Z
dd Z  ZS )r   a  
    Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].
    
    - Any values less than min are set to ``min``.
    
    - Any values greater than max are set to ``max``.

    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. 
    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
    
    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` 
    (for example: :ref:`api_paddle_optimizer_SGD`).

    Note:
        ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0. 
        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
    
    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max`` 
            automatically. In this case, ``max`` must be greater than 0.

    Examples:
        .. code-block:: python
        
            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            linear = paddle.nn.Linear(in_features=10, out_features=10, 
                                      weight_attr=paddle.ParamAttr(need_clip=True), 
                                      bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

            clip = paddle.nn.ClipGradByValue(min=-1, max=1)
            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sdg.step()
    Nc                    s@   t t|   |d u r|dksJ | }t|| _t|| _d S )Ng        )ra   r   rM   rI   rJ   rK   rL   rb   r   r   rM      s   
zClipGradByValue.__init__c                 C   rN   )Nz(Clip Gradient By Value, min = %f, max=%frO   r?   r   r   r   rA     rP   zClipGradByValue.__str__c                 C   sb   g }|D ]*\}}|d u rqt |dddu r|||f qtj|| j| jd}|||f q|S )N	need_clipTFr)   rK   rJ   )r\   appendpaddlerQ   rK   rJ   r@   rf   params_and_gradsrm   rn   new_gradr   r   r   rg     s   zClipGradByValue._dygraph_clipc              
   C   s   g }t  }tdT |D ]I\}}|d u rqt|dddu r&|||f q|jj||g tj	|| j
| jd}W d    n1 sDw   Y  |||f |j||j< qW d    n1 saw   Y  t|| |S )Ngradient_cliprx   TFry   )dictr   r   r\   rz   rC   program_optimized_guardr   rQ   rK   rJ   name_correct_clip_op_role_varr@   rf   r}   param_new_grad_name_dictrm   rn   r~   r   r   r   rh     s$   
zClipGradByValue._static_clipc                 C      d S r    r   rp   r   r   r   rs   &  ri   z ClipGradByValue._process_contextc                 C   s   t j|| j| jd}||fS )Nry   )r   rQ   rK   rJ   r@   rq   rr   r~   r   r   r   rt   )  s   z!ClipGradByValue._create_operatorsr    rF   rG   rH   rU   rM   rA   ru   rv   rg   rh   rs   rt   rw   r   r   rb   r   r      s    (
r   c                       sN   e Zd ZdZ fddZdd Zejdd Zdd	 Z	d
d Z
dd Z  ZS )r   a  
    Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .
    
    - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio.
    
    - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.
    
    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
    
    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` 
    (for example: :ref:`api_paddle_optimizer_SGD`).
    
    The clipping formula is:

    .. math::
        Out =
        \left\{
            \begin{array}{ccl}
                X & & if (norm(X) \leq clip\_norm) \\
                \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\
        \end{array}
        \right.


    where :math:`norm(X)` represents the L2 norm of :math:`X`.

    .. math::
        norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}}

    Note:
        ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0. 
        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.

    Args:
        clip_norm(float): The maximum norm value.

    Examples:
        .. code-block:: python
        
            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            linear = paddle.nn.Linear(in_features=10, out_features=10, 
                                      weight_attr=paddle.ParamAttr(need_clip=True), 
                                      bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

            clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sdg.step()
    c                    s   t t|   t|| _d S r    )ra   r   rM   rI   	clip_norm)r@   r   rb   r   r   rM   f  s   zClipGradByNorm.__init__c                 C   
   d| j  S )Nz#Gradient Clip By Norm, clip_norm=%fr   r?   r   r   r   rA   j     
zClipGradByNorm.__str__c                 C   s^   g }|D ](\}}|d u rqt |dddu r|||f qtj|| jd}|||f q|S )Nrx   TFr)   Zmax_norm)r\   rz   r   clip_by_normr   r|   r   r   r   rg   m  s   zClipGradByNorm._dygraph_clipc              
   C   s   g }t dU t }|D ]G\}}|d u rqt|dddu r&|||f q|jj||g tj	|| j
d}W d    n1 sBw   Y  |j||j< |||f qW d    n1 s_w   Y  t|| |S )Nr   rx   TFr   )r   r   r   r\   rz   rC   r   r   r   r   r   r   r   r   r   r   r   rh   z  s$   
zClipGradByNorm._static_clipc                 C   r   r    r   rp   r   r   r   rs     ri   zClipGradByNorm._process_contextc                 C   s   t j|| jd}||fS )Nr   )r   r   r   r   r   r   r   rt     s   z ClipGradByNorm._create_operatorsr   r   r   rb   r   r   .  s    7
r   c                  G   s>   t | dkrtS t | dkrt| d tsJ t}| d a|S )Nr   r   )r   &_allow_pure_fp16_global_norm_clip_flagr   r   r   r   r   r   !_allow_pure_fp16_global_norm_clip  s   r   c                       sT   e Zd ZdZ		d fdd	Zdd Zejdd	 Zd
d Z	dd Z
dd Z  ZS )r   a  
    Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in 
    :math:`t\_list` , and limit it to ``clip_norm`` .
    
    - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio.
    
    - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.
    
    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
    
    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` 
    (for example: :ref:`api_paddle_optimizer_SGD`).

    The clipping formula is:

    .. math::

        t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)}

    where:

    .. math::

        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}

    Note:
        ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0. 
        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.

    Args:
        clip_norm (float): The maximum norm value.
        group_name (str, optional): The group name for this clip. Default value is ``default_group``.

    Examples:
        .. code-block:: python
        
            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            linear = paddle.nn.Linear(in_features=10, out_features=10, 
                                      weight_attr=paddle.ParamAttr(need_clip=True), 
                                      bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sdg.step()
    default_groupFc                    s6   t t|   t|| _|| _t|tsJ || _d S r    )	ra   r   rM   rI   r   
group_namer   r   auto_skip_clip)r@   r   r   r   rb   r   r   rM     s
   

zClipGradByGlobalNorm.__init__c                 C   r   )Nz+Gradient Clip By GlobalNorm, global_norm=%fr   r?   r   r   r   rA     r   zClipGradByGlobalNorm.__str__c                 C   s  g }g }g }g }|D ]d\}}|d u rq
t |dddu rq
|}t r/| r/t|}| }n|jtjj	j
krAt|}t|}t|}	|	jtjj	jksU|	jtjj	jkr[||	 q
|	jtjj	jkri||	 q
||	 q
t|t| t| dkr|S t|dkrdnd}
g }t|dkrt|}|||
 t|dkrt|}|
dkr|| n|||
 t|dkrt|}|| t|}t|}tjdg|j| jd}d}| jsd}tj|tj||d	d	}n||krd}tj||d	}|D ]F\}}|d u rqt |dddu r!|||f q|rB|j|jkr1||jn|}t||}|||f q|||f q|S )
Nrx   TFr   r-   r,   r   shaper!   valuer)   y)r\   r   Zis_selected_rowsr   merge_selected_rowsZ_get_tensor_from_selected_rowsr1   r   r"   r#   SELECTED_ROWSget_tensor_from_selected_rowsr:   r!   r$   r%   rz   r'   r   r{   Zadd_nr&   sqrtfill_constantr   r   elementwise_divelementwise_maxelementwise_mul)r@   rf   r}   sum_square_listsum_square_list_fp16sum_square_list_fp32rm   rn   
merge_gradr7   	sum_dtypeglobal_norm_varglobal_norm_var_fp16global_norm_var_fp32Zglobal_norm_var_fp64max_global_normrx   clip_varZ
clip_inputr~   r   r   r   rg     s   



 







z"ClipGradByGlobalNorm._dygraph_clipc              
   C   s  g }g }g }g }t d |D ]d\}}|d u rqt|dddu r#q|}|jj||g? |jtjj	j
krAt|}t|}t|}	|	jtjj	jkrS||	 n|	jtjj	jkra||	 n||	 W d    n1 spw   Y  qt|t| t| dkr|W  d    S |jj||g t|dkrdnd}
g }t|dkrt|}|s|st s|||
 n|| t|dkrt|}|
dkr|| n|||
 t|dkrt|}|| t|dkrt|n|d }tj|d	}tjdg|j| jd
}tj|tj||dd}W d    n	1 s'w   Y  t }|D ]\}}|d u r<q1t|dddu rN|||f q1|jj||gN t|}|jtjj	jkrs|jtjj	jkrs|dn|}t  }|j d||dd|id ||ur|j dd|id|i|j|jdd W d    n	1 sw   Y  |j!||j!< |||f q1W d    n	1 sw   Y  t"|| |S )Nr   rx   TFr   r-   r,   r   r(   r   r   Zfloat16r   r.   Yr/   r0   castr.   )Zin_dtypeZ	out_dtype)r1   r2   r3   attrs)#r   r   r\   rC   r   r   r1   r   r"   r#   r   r   r   r   r:   r!   r$   rz   r'   r   sumsr   r&   r   r   r   r   r   r   r*   r   Zcurrent_blockr6   r   r   )r@   rf   r}   r   r   r   rm   rn   r   r7   r   r   r   r   Zglobal_norm_var_other_dtyper   Z	scale_varr   Znew_gZscale_inputrC   r   r   r   rh   8  s   






#




bz!ClipGradByGlobalNorm._static_clipc                 C   s   | j |vr#g || j < | j|| j d < tjdg|j| jd|| j d < n| j|| j d  ks1td|}|jtjj	j
krEt|}t|}t|}|| j  | || _d S )NZ_clip_valuer   r   _clipz>All parameters' 'clip_norm' of a same group should be the same)r   r   r   r   r!   
ValueErrorr1   r   r"   r#   r   r   r   r:   rz   r^   )r@   r^   rq   rr   r   Zlocal_norm_varr   r   r   rs     s"   




z%ClipGradByGlobalNorm._process_contextc                 C   s   | j d }|| jvr:tj| j| j  d}tj|d}| j| j d  }tj|tj||dd}|jdks5J || j|< |jj	d|| j| dd	|id
 ||fS )NZ_scale)inputr(   r   r   )r   r   r   r/   r0   )
r   r^   r   r   r   r   r   r   rC   r6   )r@   rq   rr   Zgroup_scale_nameZgroup_norm_varr   Zgroup_scale_varr   r   r   rt     s(   


z&ClipGradByGlobalNorm._create_operators)r   Fr   r   r   rb   r   r     s    6

Sjr   c                    s   t d t| tstd du rt   djD ]}d|	 v r2d|
dv r2t d  nq|du r> d }tdd	 |D rP fd
d|D }tdd	 |D s]td|D ]}t| |_q_dS )a  
    :api_attr: Static Graph
    
    Warning:
    
        This API must be used after building network, and before ``minimize`` , 
        and it may be removed in future releases, so it is not recommended. 
        It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
        this is a better method to clip gradient. There are three clipping strategies:
         :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
         :ref:`api_fluid_clip_GradientClipByValue` .
        
    To specify parameters that require gradient clip.

    Args:
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies 
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
            :ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no 
            gradient clipping.
        param_list (list(Variable), optional): Parameters that require gradient clip.
                It can be a list of parameter or a list of parameter's name.
                Default None, meaning that all parameters in the program will be included.
        program (Program, optional): The program where parameters are located.
                Default None, meaning that using :ref:`api_fluid_default_main_program` .

    Returns:
        None

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid

            def network():
                image = fluid.data(name='image', shape=[
                                   None, 28], dtype='float32')
                param_attr1 = fluid.ParamAttr("fc1_param")
                fc1 = fluid.layers.fc(image, size=10, param_attr=param_attr1)
                param_attr2 = fluid.ParamAttr("fc2_param")
                fc2 = fluid.layers.fc(fc1, size=10, param_attr=param_attr2)
                loss = fluid.layers.reduce_mean(fc2)
                return loss


            # network 1: clip all parameter gradient
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

            # network 2: clip parameter gradient by name
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByValue(min=-1.0, max=1.0),
                    param_list=["fc1_param", "fc2_param"])
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

            # network 3: clip parameter gradient by value
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                param_var1 = fluid.default_main_program().global_block().var("fc1_param")
                param_var2 = fluid.default_main_program().global_block().var("fc2_param")
                fluid.clip.set_gradient_clip(
                    fluid.clip.GradientClipByValue(min=-1.0, max=1.0),
                    param_list=[param_var1, param_var2])
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)
            
            # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                clip1 = fluid.clip.GradientClipByValue(min=-1.0, max=1.0)
                clip2 = fluid.clip.GradientClipByNorm(clip_norm=1.0)
                # Set the gradient clipping strategy: clip1
                fluid.clip.set_gradient_clip(clip1)
                # Set the gradient clipping strategy: clip2
                sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2)
                sgd.minimize(loss)
                # 'set_gradient_clip' will not take effect when setting has a conflict, 
                # and the gradient clipping strategy will be 'clip2'
            
            
    zCaution! 'set_gradient_clip' is not recommended and may be deprecated in future! We recommend a new strategy: set 'grad_clip' when initializing the 'optimizer'. This method can reduce the mistakes, please refer to documention of 'optimizer'.z<'clip' should be an instance of ClipGradBase's derived classNr   op_namescopeZ	optimizerz'minimize' has been invoked before, this will make 'set_gradient_clip' be ineffective! Please invoke 'set_gradient_clip' before 'minimize'.c                 s       | ]	}t |tjV  qd S r    )r   sixstring_typesrV   elemr   r   r   	<genexpr>B      z$set_gradient_clip.<locals>.<genexpr>c                    s   g | ]
}  d |qS )r   )rC   varr   r   r   r   rY   C  s    z%set_gradient_clip.<locals>.<listcomp>c                 s   r   r    )r   r   	Parameterr   r   r   r   r   D  r   zK'param_list' should be a list of Parameter or basestring(parameter's name).)rk   rl   r   r`   r]   r   r   rC   opsZ	all_attrsattrZall_parametersallcopydeepcopyrj   )rQ   Z
param_listr   r[   rq   r   r   r   r     s6   
Z
r   c           	   
   C   s  t  }| D ]d\}}|d u rq|jj||gJ td5 t|dd }|d u r:| W  d    W  d      S t|tsCt	d|j
|||d W d    n1 sUw   Y  W d    n1 sdw   Y  qg }t  }| D ]M\}}|d u rzqq|jj||g3 td |j||d\}}|j||j< |||g W d    n1 sw   Y  W d    n1 sw   Y  qqt|| |S )Nr   rj   z8clip attribute should be an instance of GradientClipBase)r^   rq   rr   )rq   rr   )r   rC   r   r   r   r   r\   r   r`   r]   rs   rt   r   rz   r   )	Zparam_gradsr^   rm   rn   Z	clip_attrresr   rq   r~   r   r   r   append_gradient_clip_opsM  sR   "
  
r   c           	      C   s   g }t |dkr
d S | D ]I\}}|d u rq|jj}||v rq|| |jj jD ]*}|drTd|dv rT|drT|dd }||v rT||| g}|	d| q*qd S )Nr   r   r   Zop_role_var)
r   rC   idxrz   r   Zglobal_blockr   Zhas_attrr   rT   )	rf   r   Zblock_id_listrq   rr   Zblock_idr[   
param_nameZcorrect_p_gr   r   r   r   o  s0   
r   )NN)0
__future__r   r   r   rk   	functoolsr{    r   r   r   r   Zdygraphr   ru   Zdata_feederr	   r
   r   r   Zlayer_helperr   r   r   r   __all__r   r   r*   r:   objectr;   r   r_   r`   r   r   r   r   r   Zdygraph_not_supportr   r   r   ZGradientClipBaseZGradientClipByValueZGradientClipByNormZGradientClipByGlobalNormr   r   r   r   <module>   sP   		<#[f  0|"