o
    Me                    @   s  d dl mZ ddlmZ d dlmZ d dlmZ ddlmZ d dl	Z	d dl
Z
d dlZd dlZdd	lmZ dd
lmZ ddlmZ d dlZddlmZ d dlZzd dlmZ W n
   d dl	mZ Y ddgZejeejddZG dd deZdd Z	dbddZdbddZ dd Z!dcddZ"dd Z#d d! Z$d"d# Z%d$d% Z&d&d' Z'd(d) Z(d*d+ Z)	,ddd-d.Z*	,ddd/d0Z+		dcd1d2Z,dg fd3d4Z-d5d6 Z.d7d8 Z/	dbd9d:Z0	dbd;d<Z1d=d> Z2d?d@ Z3						dedAdBZ4dCdD Z5dEdF Z6dGdH Z7dIdJ Z8dKdL Z9dMdN Z:dOdP Z;ej<					dfdQdZ=dRdS Z>dTdU Z?dVdW Z@dXdY ZA		Zdgd[d\ZBdcd]d^ZCej<dcd_dZDej<dcd`daZEdS )h    )print_function   )framework_pb2)	framework)program_guard)coreN   )compat)unique_name)
log_helper)
check_type)Sequenceappend_backward	gradientsz&%(asctime)s-%(levelname)s: %(message)s)fmtc                   @   sT   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd ZdS )ProgramStatsc                 C   s   || _ || _i | _i | _d S N)blockopsop_depsvar_op_deps)selfr   r    r   ED:\Projects\ConvertPro\env\Lib\site-packages\paddle/fluid/backward.py__init__0   s   
zProgramStats.__init__c                 C   s   g }| j D ]%}t| j | d dkr*t| j | d dkr*| j|jr%q|| q| jD ]}|j dkr?|	|j
  q.|S )Nvar_as_output_opsr   var_as_input_opsread)r   lenr   varpersistableappendr   desctypeextendoutput_arg_names)r   input_namesnameopr   r   r   get_input_nodes6   s   


zProgramStats.get_input_nodesc                 C   s2   g }| j D ]}|j dkr||j  q|S )Nseed)r   r"   r#   r$   r%   )r   var_namer(   r   r   r   get_reserved_varsC   s   
zProgramStats.get_reserved_varsc                 C   s   g }t ||dD ]H}| j| j D ]}|| jv r,| j| d D ]}||kr+|| q q| j| j D ]}|| jv rO| j| d D ]}||k rN|| qCq5q|S )Nr   r   r   )ranger   r"   r%   r   r!   input_arg_names)r   Zbegin_op_idxZ
end_op_idxr+   ir'   idxr   r   r   get_out_of_subgraph_varsJ   s    



z%ProgramStats.get_out_of_subgraph_varsc                 C   s   t | j}d}|D ]}|| jvrd||f  S q	|D ]}|| jvr(d||f  S q|D ]}| j| d }|D ]}t||}q6q+|D ]}| j| d }|D ]}t||}qLqA||kr^d||fS d||fS )NFr   r   T)r   r   r   minmax)r   Z
var_group1Z
var_group2Z
min_op_idx
max_op_idxr'   op_idxr0   r   r   r   is_subgraphY   s.   




zProgramStats.is_subgraphc                    s|    fdd}|d }|}||kr<| j | r5td j | j  j | j d  |}|d8 }n	 |S ||ks|S )zZ
        persist vars of amp-related cast should be included in recompute segment
        c                    s&   | j  dko j| j  d jS )Ncastr   )r"   r#   r   r   r.   r    )r(   r   r   r   is_amp_castw   s
   z7ProgramStats._update_segment_start.<locals>.is_amp_castr   zfound amp-cast op: {}, : {}r   )r   _loggerinfoformatr"   r#   r.   )r   min_idxpre_segment_end_idxr:   Zidx_Zupdated_min_idxr   r9   r   _update_segment_startr   s   

z"ProgramStats._update_segment_startc                 C   sV  t | jD ]\}}g g d| j|< t |j D ]\}}|| jv r0| j| d | j| d  qt |j D ])\}}|| jv rM| j| d |g q8i | j|< |g| j| d< g | j| d< q8t |j D ])\}}|| jv r~| j| d |g qii | j|< g | j| d< |g| j| d< qi| j| d D ]}| j| d |g qqd S )N)in_opsout_opsrA   r   r   rB   )	enumerater   r   r"   r.   r   r$   r%   )r   r/   r(   jr'   r6   r   r   r   build_stats   s0   




zProgramStats.build_statsc                 C   s   g }|D ].}|| j vrtd|  q| j | d g kr$||df q||t| j | d f qt|dd d}dd |D S )	Nz[Recompute Optimizer: deleted %s from checkpoints, because it is not used in paddle program.r   r2   c                 S   s   | d S )Nr   r   xr   r   r   <lambda>   s    z/ProgramStats.sort_checkpoints.<locals>.<lambda>)keyc                 S   s   g | ]}|d  qS )r   r   .0rG   r   r   r   
<listcomp>   s    z1ProgramStats.sort_checkpoints.<locals>.<listcomp>)r   r;   r<   r!   r4   sorted)r   checkpoints_nameZsorted_checkpointsr'   r   r   r   sort_checkpoints   s   
zProgramStats.sort_checkpointsc              	   C   s~  dd | j D }d|vrd S d}|t| j k r| j | }|j dkr(|d7 }q|dd ur=t|ddkr=|d7 }qtd}td|d	g}| j	j
|d
tjjjddd}|ddu rcdnt|d}tj }d}	|j|r}|j|}	| j	j|jdi d|gi||	ddd}
| j ||
 |jd|g |jd |jd | j	  |d7 }|t| j k sd S d S )Nc                 S   s   g | ]}|j  qS r   )r"   r#   )rK   r(   r   r   r   rL      s    zBProgramStats.modify_forward_desc_for_recompute.<locals>.<listcomp>Zdropoutr   r   ZSeedr*   .tmpZint32F)r'   dtyper#   r    stop_gradientZfix_seed OutT)r*   	op_device	force_cpu)indexr#   inputsoutputsattrsr   )r   r   r"   r#   inputr
   generateZgenerate_with_ignorable_keyjoinr   
create_varr   VarDescVarTypeZ
LOD_TENSORattrintop_proto_and_checker_makerkOpDeviceAttrNamehas_attrZ
_insert_opr0   insert	set_inputZremove_attr_sync_with_cpp)r   Zop_typesr6   r(   Zop_unique_nameZvar_unique_nameZ	added_varr*   op_device_attr_namerV   Zadded_opr   r   r   !modify_forward_desc_for_recompute   sX   
 
 

	
z.ProgramStats.modify_forward_desc_for_recomputeN)__name__
__module____qualname__r   r)   r,   r1   r7   r@   rE   rO   rk   r   r   r   r   r   .   s    r   c              
   C   s@   d|d t |  |d d|  |d d|  f }|S )Nz3%s	name:[%s]
%s    	inputs:[%s]
%s    	outputs:[%s]Z_op_input _output)strr#   r^   r.   r%   )op_descprefixZout_sr   r   r   _pretty_op_desc_   s   "ru   c                 C   s   t | dkrg S g }tj }tjjj}| D ]f}|}	d}
t|tjr'|j	}d}
t|t
r0|d }d}| D ]}||rD||jrDq6||vrJd}q6|r||
rY|d urY|	|| < |j	 }|| ||| |drw|d|d || q|S )Nr   FTrV   )r   r   rd   kOpRoleAttrNameOpRoleBackward
isinstancer   Operatorr"   tupler%   has_varr   r    original_id	append_op	copy_from	_set_attrrf   rb   r!   )descsr   Z
main_blockZin_memory_varsgrad_op_id_to_fwd_opresult_descsop_role_attr_namebackwardr"   Zorigin_descZorigin_is_operatorZ	is_neededr'   new_op_descr   r   r   _add_needed_descs_to_block   s@   





r   c                 C   s   t | dkrg S g }tj }tjjj}| D ]B}t|tjr,|d ur)|||j	
 < |j	}t|tr5|d }|j	 }|| ||| |drS|d|d || q|S )Nr   rV   )r   r   rd   rv   rw   rx   ry   r   rz   r"   r}   r{   r~   r   r   rf   rb   r!   )r   r   r   r   r   r   r"   r   r   r   r   _add_descs_to_block  s(   




r   c                 C   s`   t | jjD ]}t|tjsJ t|jdkr$|jd | jkr$|| _	 nq| j	d u r.t
dd S )Nr   r   z#loss.op is None. Should not happend)reversedr   r   ry   r   rz   r   r%   r'   r(   
ValueError)lossr(   r   r   r   _find_loss_op_,  s   
r   c           	      C   s   |du rd}|du rt | }t| ttfr6t||D ]}| | }t|tr)|d }||| ||| qt| tjr]| 	 D ]\}}t|ttfr\|D ]}||| ||| qMq@dS dS )z
    Traverse all ops in op_descs[begin_idx : end_idx],
    if any op has inputs/outputs named "old_name", rename it as 'new_name'
    Nr   )
r   ry   listr{   r-   _rename_input_rename_outputcollectionsOrderedDictitems)	op_descsZold_namenew_nameZ	begin_idxZend_idxr/   rs   rI   valuer   r   r   _rename_arg_7  s(   
r   c                 C   s   t  }||  t|D ]\}}||ttdd | qt|D ]\}}||ttdd | q%t j	
 }t j	 }||vrLt j	jj||< ||vrTd||< t|D ]\}	}
t|
tjrk||	|
j qY||	|
 qY|S )zS
    Create a C++ OpDesc object with specified inputs, outputs and attributes.
    c                 S      t | tjr
|  S | S r   ry   sixbinary_typedecodeargr   r   r   rH   Z     

z"_create_op_desc_.<locals>.<lambda>c                 S   r   r   r   r   r   r   r   rH   a  r   rT   )r   OpDescset_typer   	iteritemsrh   r   map
set_outputrd   rv   re   rw   rx   ry   r   BlockZset_block_attrr"   r   )Zop_typerY   rZ   r[   rs   paraargsr   rj   r'   valr   r   r   _create_op_desc_O  sH   


r   c                 C   sl   t di dt| jgiddgddd| jdd	tj ttjjj	ttjjj
B tj | jtj i}|S )
Nfill_constantrU   shaper   r         ?rR   rW   F)r   _append_grad_suffix_r'   rR   r   rd   rv   rc   rw   rx   Lossre   r(   rb   )r   rs   r   r   r   _create_loss_op_desc_t  s$   r   c                 C   s   |j t| }t| }|j t|r/|j t|}||  |	|
  dS td|  |tjjj dS )z>
    Infer the data type and shape of given grad variable
    zSSet grad var: {} dtype to default FP32, since we can't find its related forward varN)r"   find_varcptto_bytes_strip_grad_suffix_has_var_recursiveZfind_var_recursiveZ	set_dtyperR   	set_shaper   warningswarnr=   r   r`   ra   ZFP32)grad_var_namer   grad_varZfwd_nameZfwd_varr   r   r   _infer_var_data_type_shape_  s   r   c                 C   s,   t | dkrdS | D ]	}||vr dS q
dS )z8
    Test if all elements of 'cands' are in set 's'
    r   FT)r   )candsscr   r   r   _all_in_set_  s   r   c                 C   s@   t | dkrdS t|}t| }|D ]	}||v r dS qdS )z9
    Test if some elements of 'cands' are in set 's'
    r   FT)r   r   to_text)r   r   Zliteral_setZliteral_candsr   r   r   r   _some_in_set_  s   

r   c                 C   sV   t | } | t }|dkr| d| n| }| d}|dkr)||d d S |S )zq
    Strip the grad suffix from the given variable name
    e.g. x@GRAD ==> x
         y@GRAD@RENAME@1 ==> y
    r2   Ngrad/   )r   r   findr   grad_var_suffixrfind)r'   posr   new_posr   r   r   r     s
   

r   c                 C   s   t | t  S )zM
    Append grad suffix to the given variable name
    e.g. x ==> x@GRAD
    )r   r   r   r   r'   r   r   r   r     s   r   rT   c              	   C   sN   ||  vr
g ||< || tdd||  id| gid|d | g|| < dS )zW
    Use sum op to accumulate_gradients, the gradients are stored in renamed_vars.
    sumXrU   FZ
use_mkldnnrV   N)keysr!   r   )r+   renamed_varspending_sum_opsr6   rV   r   r   r    _accumulate_gradients_by_sum_op_  s   r   c           	   
   C   s   ||  vr
g ||< ||  d }tdt||  D ]6}|}||  | }|t||  d kr6| d t| }n| }|| td|g|gdd|gid|d q| g|| < d	S )
zg
    Use several inplace add op to accumulate_gradients, the gradients are stored in renamed_vars.
    r   r   z@ADD@Zgrad_add)r   YrU   Fr   N)r   r-   r   rr   r!   r   )	r+   r   r   r6   rV   Zout_namer/   Zx_nameZy_namer   r   r   !_accumulate_gradients_by_add_ops_  s*   r   c              
      s  t  d }t }tt}tt}tt}tt}	t| D ]F\}
}t	j
 }d}||r8||}| D ],dvrCq<t| dkrht| |kr^t|||
|	  q<t|||
|	  q<t| D ]\}}||}t|D ]\}dvrq|t	 ks| v rq|t| dkrg|< |
|< q|t| dkr%d t| d t|   |  d7  < |dur؈|v r| | < n| <  | d< t|  | |
 t|  | d| D ]}||}|v r|| fd	d
|D  q fdd
|d| D ||d  }d t| d t|   |  d7  < |durR|v rN| | < n| <  ||< ||| |   ||	< q|qoq"t|D ]1\}t| dkrt| |krt||t| |	  qot||t| |	  qot| }ttt|  D ]D\}}|}
t|D ]8\}}|
|kr|
d n|
| }|dur|| |  ddur|| |   || < | |
| | qq| S )a  
    In backward part, an variable may be the output of more than one ops.
    And one op may yield its multiple outputs to the same variable.
    In these cases, the variable should be the accumulation of all the outputs.
    `sum_op`s are added to implement the accumulate.

    Args:
        grad_var_to_var(dict): used to build the mapping between grad var name and forward var name.
        Only for auto parallel.
    ZFLAGS_max_inplace_grad_addrT   @GRADr   r   z@RENAME@block@Nc                       g | ]
}|kr
 n|qS r   r   rJ   r   r+   r   r   rL   I      z._addup_repetitive_outputs_.<locals>.<listcomp>c                    r   r   r   rJ   r   r   r   rL   N  r   )r   Z_global_flagsr   r   defaultdictrc   r   rr   rC   r   rd   re   rf   rb   r.   r   r   r   output_namesoutputempty_var_namer   r   r!   r   r   r   r   getr}   rg   )r   	block_idxgrad_var_to_varr   Z_MAX_ADD_NUM_r   Zvar_rename_countr   Zrenamed_var_start_idxZ
var_devicer0   rs   rj   rV   Z	param_idx
param_name	arg_namesZarg_idxpZp_arg_namesrY   Zop_descs_lenrI   r   r/   r(   Z
target_idxr   r   r   _addup_repetitive_outputs_  s   




















B

	r   c           
   	      s   fdd t dd |D  fddD g }tD ]H\}}| D ]?}t |v rf|v rft|}tdd|gid|gid	d
d}	|dur_|| ddur_||  ||	 < |	|	|f q'qt
fddt|D  S )z
    Remove unnecessary grad ops
    A grad op can be removed in two cases:
        1. all outputs of the grad op are in 'no_grad_set'
        2. all grad inputs of the grad op are in 'no_grad_set'
    NOTE: we will skip target_vars's grad name.
    c                    sT   |   }t|dkst||rdS tdd |  D |r(|t|   dS dS )Nr   Tc                 S   s"   g | ]}| t d kr|qS )r2   r   r   r   rK   r'   r   r   r   rL     s
    zH_remove_no_grad_branch_.<locals>._op_can_be_removed_.<locals>.<listcomp>F)r%   r   r   r.   updateset)rs   no_grad_setZout_arg_names)target_grad_var_namesr   r   _op_can_be_removed_  s   z4_remove_no_grad_branch_.<locals>._op_can_be_removed_c                 S   s   g | ]	}|j t  qS r   )r'   r   r   rK   r   r   r   r   rL     s    z+_remove_no_grad_branch_.<locals>.<listcomp>c                    s   g | ]	} |s|qS r   r   rK   rs   )r   r   r   r   rL     s    Zfill_any_liker   rU   r   r2   )r   rR   Nc                    s    g | ]}  |d  |d qS )r   r   )rg   )rK   r   )r   r   r   rL     s     )r   rC   r.   r   r   r   r   r   r}   r!   r   r   )
r   r   r   target_varsZ	to_insertr0   rs   r   Zx_inr   r   )r   r   r   r   r   _remove_no_grad_branch_  s<   

r   c                    s  G  fdddt G fdddt  t fddfdd fd	d
}|du r4t nt|}|D ]}||j  ||j  q:t }t }| D ]&}t| }	|	| | }
||  ||}t|
t|	kr{|| qUg }|D ]N}|g}t|j	}d}|g}t|dkr|
d}t|j	|r|jD ]}||j ||j q||j nd}nt|dks|r|dd |D  qt|}t| }||krt S |S )a  
    Pruning Program with Structural Analysis Method of Computational Graph.
    The nodes of the computational graph composed of backward OPS should be
    interconnected. If there are unconnected sub-graphs in the computational graph,
    these sub-graphs should be cut off.

    Args:
        grad_op_descs(list[core.OpDesc]): The candidate backward OpDescs.
        forward_ops(list[Operator]): The forward ops.
        input_grad_names_set(set): this set is used to store the gradients' name
            which is generated by backward ops, and input_grad_names_set can help
            to prune the unnecessary backward ops.

    Return:
        (set[core.OpDesc]): A set of OpDescs which should be pruned.
    c                       ,   e Zd Zdd Z fddZ fddZdS )z_find_not_need_ops.<locals>.Varc                 S   s   || _ d | _g | _d S r   )r+   gen_oppendding_ops)r   r+   r   r   r   r        
z(_find_not_need_ops.<locals>.Var.__init__c                    s&   t | sJ | jd u sJ || _d S r   )ry   r   )r   r   Opr   r   
set_gen_op  s   
z*_find_not_need_ops.<locals>.Var.set_gen_opc                       t | sJ | j| d S r   )ry   r   r!   )r   r(   r   r   r   add_pending_op     z._find_not_need_ops.<locals>.Var.add_pending_opN)rl   rm   rn   r   r   r   r   r   r   r   Var  s    r   c                       r   )z_find_not_need_ops.<locals>.Opc                 S   s   || _ g | _g | _d S r   )rs   rY   rZ   )r   rs   r   r   r   r     r   z'_find_not_need_ops.<locals>.Op.__init__c                    r   r   )ry   rY   r!   r   r   r   r   r   insert_input  r   z+_find_not_need_ops.<locals>.Op.insert_inputc                    r   r   )ry   rZ   r!   r   r   r   r   insert_output  r   z,_find_not_need_ops.<locals>.Op.insert_outputN)rl   rm   rn   r   r   r   r   r   r   r   r     s    r   c                    s:   |   vr | g| < n	|   |  |  d S Nr2   )r   r!   r   r   var_versionsr   r   _create_node  s   z(_find_not_need_ops.<locals>._create_nodec                    s&   |   vr | g| < |  d S r   )r   r   r   r   r    _create_or_get_last_version_node  s   z<_find_not_need_ops.<locals>._create_or_get_last_version_nodec                    sd    | }|   D ]}|d}|| || q|  D ]}|d}|| || q|S )Nr   )r.   r   r   r%   r   r   )rs   op_noder\   r   r   )r   r   r   r   r   _create_op_node  s   



z+_find_not_need_ops.<locals>._create_op_nodeNTr   Fc                 S      g | ]}|j qS r   )rs   )rK   noder   r   r   rL   2      z&_find_not_need_ops.<locals>.<listcomp>)objectdictr   r   r"   r.   r%   r   addrY   popr   rZ   r$   r   )grad_op_descsZforward_opsinput_grad_names_setr  Zforward_vars_setr(   Zbackward_vars_setZspecial_op_nodesrs   Z	input_setnew_varsr   Znot_need_op_descsZspecial_op_nodeop_listZ
ready_varsZ
remove_opsZcandidate_opsout_varZnot_need_op_descs_setZgrad_op_descs_setr   )r   r   r   r   r   r   _find_not_need_ops  s\   




r  c                 C   s"   |   }tjt|}| S r   )Zserialize_to_stringr   r   Z
FromStringr   r   __str__)rs   Zprotostrprotor   r   r   serialize_op_decs<  s   r  c           1      C   s  dd |D }t t|}| j }	| j }
t| |}|  |  ||}g }t|dkrcd}|d g}|D ]}||j	vrC n|j	| d }|D ]}t
||}qLq:|dkrb|d|d g nAd}d}	 |t|d krqn3||| g||d  g\}}}|r|||}|||d g ntd||d  |d7 }qh|g kr|d d dkrd|d d gg| }n|}t|D ]j\}\}}td	| td
|| j || j  td||d  j ||d  j  td	| td
|| j || j  td||d  j ||d  j  qg }|D ]}|||d |d  q1t|t| }tdt|| ||  ||  t t|}g }i }|| } t|}!tj }"|g kr|d|! }#t|#D ]]}$|$drtdt|$jd t |$jt!"|| j# g \}%}&|dur|%D ]	}'|$||'$ < q|$j|"r|$j%|"}(|%D ]	}'|'&|"|( qt'|%|	|})||) |(|& qt|ddd D ]e\}}||d |! }#|d }!t|#D ]]}$|$drtdt|$jd t |$jt!"|| j# g \}%}&|dur8|%D ]	}'|$||'$ < q.|$j|"rQ|$j%|"}(|%D ]	}'|'&|"|( qGt'|%|	|})||) |(|& q||d |d  }*d| }+|*D ]e}$|$drtdt|$jd g },|,|$j  |,|$j)  |,D ]=}| *|j+s||v rq||v rq||vr||+ ||< | j, *|}-| j-|| |-j.|-j/|-j|-j+|-j0d qqst1|*|
| | |}.t'|*|	|})|D ]}/t2|.|/||/  q||. t|)D ]V}'t |'t!"|| j# g \}%}&|dur$|%D ]}0||'$  ||0$ < q|'|"r;|'%|"}(|%D ]	}0|0&|"|( q1|D ]}/t2|%|/||/  q=||% |(|& qqt3|| j#|d}t4||| j# ||}t'|||})||||fS )a  
    Create grad ops with forward ops, and insert them into given block

    Args:
        block(Block): the block where forward ops are
        ops(Op): the forward operators whose forward recomputation backward ops need to be added
        target_vars(list[Tensor]): the loss vars we want to calculate gradient.
        target_block(Block): the block which is going to hold new generated grad ops
        no_grad_dict(dict):
            key(int) block index
            val(str): corresponding forward variable name
        checkpoints: variables that a user defined as checkpoint for forward recomputation

    Algorithms:
        0) deal with forward recomputing program descs
        1) find ops between checkpoints, i.e. recompute_segments
        2) go through all forward ops and induct all variables that will be hold in memory
            a. variables that are used across segments will be held in memory
            b. output of dropout op will be held in memory
            c. input variables will be held in memory
        3) go through each recompute_segments, add backward ops with forward recomputation
            a. add ops in current recompute_segment as forward recomputation ops
            b. rename all non-checkpoint variables in recomputation ops
            c. add backward ops of current recomputation ops
            d. add sum op for repetitive_outputs
        4) remove no grad branch as it is in _remove_no_grad_branch_
        5) Note1: all appended ops' OpRole are Backward
        6) Note2: all variables with new name should be returned so that _append_backward_vars_ can be called
        7) Note3: current forward recomputation backpropagation does not handle programs with subblock
    c                 S   r  r   r   rJ   r   r   r   rL   i  r  z:_append_backward_ops_with_checkpoints_.<locals>.<listcomp>r   r2   r   r   Tz)Could not recompute op range [{}] - [{}] zrecompute segment[{}]zsegment start op: [{}]: [{}]zsegment end op: [{}]: [{}]zifound [{}] vars which cross recompute segment: [{}], better checkpoints might be set to reduce those vars	sub_blockz7Recompute don't support ops with sub_blockinvoke op: %sZwith_sub_blockNz.subprog_%d)r'   r   rR   r#   r    rS   r   )5r   r   program_create_blockr   rk   rE   rO   r   r   r4   r!   r7   r@   r;   r<   r=   rC   r"   r#   r.   r$   r1   r,   r)   r   rd   re   r   rf   	Exceptionru   get_grad_op_descr   r   r0   r}   rb   r   r   r   r%   r   r    global_blockr_   r   rR   rS   r   r   r   r   )1r   r   r   target_blockno_grad_dictgrad_to_varcheckpointsr   rN   Zlocal_blockZbuffer_blockprogram_statsegmentsr5   Z	var_groupr'   r6   r0   Z	start_idxr?   flagr>   Zmax_idxrecompute_segmentsr/   Zidx1Zidx2vars_should_be_holdsegmentZ
cross_varsr	  Zvar_name_dictZvars_in_memoryZmax_calculated_op_positiondevice_attr_nameZgap_opsr(   grad_op_descop_grad_to_varrs   rV   Zadded_descsZff_opsZ
var_suffixZinput_and_output_namesZref_varZbuffer_descsrI   Z	g_op_descr   r   r   &_append_backward_ops_with_checkpoints_B  st  '






  
















r&  c                    s   | dr j|dksJ t|ttdfsJ |du r!|j}|jdv r] fdd|D }|D ]} jD ]}||jv rK|jD ]
}|	 
| q@q6q1|jdv }	t |g |||	}
|
S  jS )a  
    Get output vars in subblock which will be assigned to parent block.
    It is used to find the grad path in subblock.

    Args:
        sub_block(Block): The sub-block in which to get op path.
        sub_block_op_desc: The op desc of the sub-block op such as 'while', 'conditional_block' and 'recurrent'.
        no_grad_set(set): The set of no grad var name. no_grad_set will be changed.
        op_path_dict(dict): op_path_dict will be changed.
            key(int) block index
            val(list) the op path of block(index)
        sub_block_target_names(set): Target var names of sub-block.
    Return:
        The forward op path of sub-block corresponding to backward op.
    r  N)Zconditional_blockwhilec                    s   g | ]}  |qS r   )_var_recursiver   r  r   r   rL   b  s    
z'_get_sub_block_path.<locals>.<listcomp>)r'  )rf   r0   _block_attr_idry   r   r#   r%   r   r.   r!   r(  _find_op_path_)r  Zsub_block_op_descr   op_path_dictsub_block_target_namesZsub_outputsr   rs   r'   is_whileZsub_block_op_pathr   r)  r   _get_sub_block_pathB  s4   






r/  c                 C   sB   t j}t jjj}| | jv rt|  |  t|krdS dS )NTF)	r   rd   rw   rx   kOpRoleVarAttrNameZ
attr_namesrc   Z	all_attrsrv   )r(   Zop_makerr   r   r   r   _is_grad_op_s  s   
r1  c                 C   s   d| |  S )Nr   r   )r'   Z
grad_orderr   r   r   _rename_grad_name_|     r2  c           #         s    fdd}|dur"t |ttfsJ |D ]}t|ds!tdqg }| j}|
du r-i }
t |
ts4J t|D ]fg }dr||	
d}| }||j t}d|
d }t|||||||||d
 ||  ||j tjt|| j |\ }|dur D ]}|| < q|	dur||	||j nttjjjdd}|dur|j}	||	||j tj  }j|rڈj!|} D ]}|"|| q|jd	krI D ]f}j# }|# D ]}||
v r||vr|$||
|  q|% D ]C}d
|vrq| j&|'drFt(||j}|)|| ||
|< ||v rF|	dur;|| |	j*|j |< || ||< |+| qqdurfddd} D ]3}fdd|# D }t,|dkrpd} nt-|r|| d}|% D ]}.| qqX|r|/| q8|0  |/| q8d}|	dur|	j*|j }t1|| j||d}t2||| j ||}t3||fdd|D }tj4 } tjj5j6}!|D ]2}|j7 }"|"8| |""| |! |"|d< |durt |ttfs J |D ]	}|||d qqdS )a  
    Create all grad ops, and insert them into given block

    Args:
        block(Block): the block where forward ops are
        ops(Op): the forward operators whose backward ops need to be added
        target_vars(list[Tensor]): the loss vars we want to calculate gradient.
        target_block(Block): the block which is going to hold new generated grad ops
        no_grad_dict(dict):
            key(int)  block index
            val(set) a set of variable names. These variables have no gradient
        grad_to_var(dict)(output argument):
            key(str): grad variable name
            val(str): corresponding forward variable name
        callbacks(callable object): a callable object used to decorate new generated grad ops
        input_grad_names_set(set): this set is used to store the gradients' name which is
            generated by backward ops, and input_grad_names_set can help to prune the unnecessary
            backward ops.
        op_path_dict(dict): op_path_dict will be changed.
            key(int) block index
            val(list) the op path of block(index)
        rename_var_map(dict): used to associate target_grad var name with first grad_op input name.
            Only used in for high order gradient.
    c                    sD   | j | |  D ]}| | jvsJ j | j| < q
d S r   )r   r   r}   Zgrad_op_id_to_op_idr"   )distop_contextr%  Zappending_grad_timesrs   )r$  r(   r   r   update_distop_context  s   
z4_append_backward_ops_.<locals>.update_distop_contextN__call__z%'callback' must be a callable object.r  r  Z_g_default_distributed_contextr   r   asciic                    s   |  t dkp|  v S r   r   r   )r
  r   r   rH     s    z'_append_backward_ops_.<locals>.<lambda>Fc                    s   g | ]} |r|qS r   r   r   )is_grad_namer   r   rL     s    z)_append_backward_ops_.<locals>.<listcomp>r   Tc                    s   g | ]}| vr|qS r   r   r   )not_need_opsr   r   rL   H  s    Z__current_op_desc__)r   context)9ry   r   r{   hasattrr   r  r  r   rf   r   r*  r  _set_forward_block_idxr0   copy_append_backward_ops_Z	_rollbackr!   r"   r   r  r   r   r}   _appending_grad_timesgetattrpaddledistributedZauto_parallelZdist_contextZdist_op_contextrd   re   rb   r   r.   r   r%   r   encoder2  r   r   r  r   r   r  r   r$   r   r   r  rv   rw   rx   r~   r   )#r   r   r   r  r  r  	callbacksr
  r,  r4  rename_var_mapr   r5  cbr	  r  Zgrad_sub_block_listr  Zgrad_sub_blockZpre_input_grad_names_setsub_block_pathr%  rs   Zdefault_ctxr#  rV   Zforward_op_inputsr'   r   Zis_append_gradZinput_grad_namesr   r   r   r   r   )r$  r
  r8  r9  r(   r   r>    s  &
























r>  c                 C   s   t  | v S r   )r   r   )r+   r   r   r   _is_grad_var_Z  r3  rH  c                 C   s   | j }|dkr	d S | j}tj|jD ]*}||j}tj| D ]}|	|}|
dr<|d|kr<|    S q#qd S )Nr   r  )r0   r  r   movesr-   Z
num_blocksr   r"   op_sizer(   rf   r*  )r  sub_block_idr  Zblock_idZ
block_descr6   r(   r   r   r   _find_parent_op__  s"   
rL  c                    s  g }	 t  }g |dur#| }| }|D ]}||v r"| qt| j D ]}	 j|	}
|
drH j	
|
d}t|d|| dd |
 D }dd |
 D }dd |
 D }dd |
 D }|st||	 q+	 |r fd	d|D }|s	 |
 d
vr||	 q+t }|
 D ]-} jt|s|t krq jt| || ||vrq| f||| < q|
  |
 j |
 j |
 D ]}||v rt|  qq+t|D ]}	 j|	|	d  qdS )a  
    Create new variables required by backward pass.

    Args:
        block(Block): the block where new variables will be created
        start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
        grad_to_var(dict):
            key(str): grad variable name
            val(str): corresponding forward variable name
            In most cases, this dict is generated by _append_backward_ops_()
        grad_info_map(dict)(output argument):
            key(str): forward variable name
            val(tuple): a tuple of (str, Block), str is the corresponding grad name, Block is the block containing grad variable
    Nr  r   c                 S      g | ]}t |r|qS r   rH  r   r   r   r   rL     
    z*_append_backward_vars_.<locals>.<listcomp>c                 S   rM  r   rN  r   r   r   r   rL     rO  c                 S      g | ]
}|t  kr|qS r   r   r   r   r   r   r   rL     
    c                 S   rP  r   rQ  r   r   r   r   rL     rR  c                    s*   g | ]} j t|s|v r|qS r   )r"   r   r   r   r   r   Zparent_op_varsr   r   rL     s    )Zrnn_memory_helper_gradr   )rL  r.   r%   r!   r-   r"   rJ  r(   rf   r  r   r*  _append_backward_vars_r#   r   r   r   r   r   r   r   r  Zcheck_attrsZinfer_var_typeZinfer_shaper   r   Z
_remove_op)r   start_op_idxr  grad_info_mapZops_to_removeZ	parent_opZ
input_argsZoutput_argsZin_argr6   rs   r  Zgrad_var_insZgrad_var_outsrY   rZ   Zexisting_grad_var_insr  r   r   r   rS  r   rT  s  s   






rT  c                 C   s   t  |}t|| j D ]?}| j|}| D ]}||v r'||||  q| D ]}d|vr3q,| j|	drKt
|}||| |||< q,qt|D ]\}	}
|	|v re||	 ||
< ||	 qRd S )Nr   r7  )r=  r-   r"   rJ  r(   r.   r   r%   r   rC  r
   r]   r   r   r   r  )r   rU  r  target_grad_mapZvar_mapr6   rs   r'   r   gngr   r   r   _rename_grad_  s,   


rZ  c                 C   s   t  }t| tjsJ | jD ]/}t|tjsJ t }t|j	 D ]}t|tj
s,J |jr7|t|j q"|||j< q|S r   )r  ry   r   Programblocksr   r   r   varsvaluesVariablerS   r  r   r'   r0   )r  r  r   block_no_grad_setr   r   r   r   _get_stop_gradients_  s   
ra  c                 C   s4   t  }|dkr| |j}|||< |}|dks|S )Nr   )r   r   r   
parent_idx)r  current_block_idxson_parent_block_idx_dictZparent_block_idxr   r   r   _get_son_parent_block_idx_dict  s   re  c                 C   s   t  }| d urCt| t ttfr:t| D ]$\}}t|tjr$||j qt|t	j
r0|| qtdt| |S tdt| |S )NzWThe type of no_grad_set's member must be paddle.fluid.Variable or str, but received %s.zGThe type of no_grad_set should be set or list or tuple, but received {})r   ry   r   r{   rC   r   r_  r  r'   r   string_types	TypeErrorr#   r=   )r   Zno_grad_set_namer/   no_grad_varr   r   r   _get_no_grad_set_name  s&   
ri  c           1      C   s  i }t | dtjd | jdu rt|  | jtj t	tjj
jt	tjj
jB  |dur6t |dttfd | jj}|d}|j}	||	}
|	dk}|sT| jd7  _|du r\t }ntt|}t|}|d ttt| |r|j|
jd}||	 n|}t||	}i }|D ]}||j ||< qt  }t!| }| j||" < |j# $| |D ]}}||}ttt%|| }t  }t&|| gg ||}t'||| g|}|| || ttt| d}|s|jdkrtt| j(g}d}|dkrt)|trt*|dkrd	}t+||| g|||||\}}}}qt,||| g||||||||d
 qt  }|s:||	 nd} t-|| |i  t.|| || |	|_|/  |j0D ]}!|1|!j" ddurm||!j"  }"|"j2|!_2qT|durt |dtttfd g }#t3|D ]-\}$}%t |%d|$ tjt4j5fd t)|%tjr|#6|%j( qt)|%t4j5r|#6|% qn|7 8 }&dd |&D }#g }'tj9 }(|#D ]\}%t:;|%|vrՐq||% })|)d }*|*<|)d st=d>|)d |)d |7 ?|%}+|*?|)d },|s| j<|)d r|'6|+|,f q|'6|+df q|'6|+|,f q|'D ][\}-}.|.du r3q(|r9|*j0n|7 j0}/t@|/D ]}!t)|!tjAsMJ |.j(|!jBv rY|!|._ nqB|.jdu ret=d|-j(|.j(g}0|.jC|(r{|0D|.jE|( |.j|(|0 q(|r|'|fS |'S )a	  
    :api_attr: Static Graph

    This function appends backward part to main_program.

    A complete neural network training is made up of forward and backward
    propagation. However, when we configure a network, we only need to
    specify its forward part. This function uses the chain rule to automatically
    generate the backward part according to the forward part.

    In most cases, users do not need to invoke this function manually.
    It will be automatically invoked by the optimizer's `minimize` function.

    Parameters:
        loss(Tensor): The loss Tensor of the network.
        parameter_list(list[Tensor|str]|tuple[Tensor|str], optional): List/Tuple of Parameters or Parameter.names
                                           that need to be updated by optimizers.
                                           If it is None, all parameters
                                           will be updated.
                                           Default: None.
        no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
                               should be ignored. All Tensors with
                               `stop_gradient=True` from all blocks will
                               be automatically added into this set.
                               If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set.
                               Default: None.
        callbacks(list[callable object]|tuple[callable object], optional): List/Tuple of callback functions.
                                               The callbacks are used for
                                               doing some custom jobs during
                                               backward part building. All
                                               callable objects in it will
                                               be invoked once each time a
                                               new gradient operator is added
                                               into the program. The callable
                                               object must have two input
                                               parameters: ``block`` and ``context`` .
                                               The ``block`` is the :ref:`api_guide_Block_en` which
                                               the new gradient operator will
                                               be added to. The ``context`` is a
                                               map, whose keys are gradient
                                               Tensor names and values are
                                               corresponding original :ref:`api_guide_tensor_en` .
                                               In addition to this, the ``context``
                                               has another special key-value pair:
                                               the key is string ``__current_op_desc__``
                                               and the value is the op_desc of the
                                               gradient operator who has just
                                               triggered the callable object.
                                               Default: None.

    Returns:
        list of tuple ( :ref:`api_guide_tensor_en` , :ref:`api_guide_tensor_en` ): Pairs of parameter and its corresponding gradients.
        The key is the parameter and the value is gradient Tensor.

    Raises:
        AssertionError: If ``loss`` is not an instance of Tensor.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.nn.functional as F

            paddle.enable_static()

            x = paddle.static.data(name='x', shape=[None, 13], dtype='int64')
            y = paddle.static.data(name='y', shape=[None, 1], dtype='float32')
            x_emb = paddle.static.nn.embedding(x, size=[100, 256])
            y_predict = paddle.static.nn.fc(x=x_emb, size=1, activation=None, name='my_fc')
            loss = F.square_error_cost(input=y_predict, label=y)
            avg_loss = paddle.mean(loss)

            # Get all weights in main_program, not include bias.
            all_weights = [param for param in paddle.static.default_main_program().block(0).all_parameters() if 'w_' in param.name]
            all_weights_name = [w.name for w in all_weights]

            # return all param_grads needed to be updated if parameter_list set default None.
            p_g_list1 = paddle.static.append_backward(loss=avg_loss)
            # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)]

            # return the param_grads corresponding to parameter_list that can be list of param (Tensor).
            p_g_list2 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights)
            # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]

            # parameter_list can be list of param.name (str).
            p_g_list3 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights_name)
            # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]

            # no_grad_set can be set of Tensors that means grad will be cut off from these Tensors.
            p_g_list4 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set([x_emb]))
            # output: [(my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)]

            # no_grad_set can be set of Tensor.name when the Tensor is created inside layers and can't be specified explicitly.
            p_g_list5 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0']))
            # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]

            # return [] because all param_grads are filtered by no_grad_set.
            p_g_list6 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights))

    r   zpaddle.static.append_backwardNrD  r   r   )rb  FT)r
  r,  r4  r   parameter_listzfluid.backward.append_backwardzparameter_list[%s]c                 S   s   g | ]}|j r|jqS r   )Z	trainabler'   )rK   paramr   r   r   rL   0  s    z#append_backward.<locals>.<listcomp>z)grad block[{0}] did not have grad var {1}zUnexpected branch)Fr   r   r_  r(   r   r   r   rd   rv   rc   rw   Forwardr   r   r{   r   r  rc  r?  r   ri  r=  ra  r   r   r   r  rb  r<  re  r"   rJ  r  r   r}   r~   r   r   r+  _find_no_grad_varsr'   ry   r   r&  r>  rZ  rT  ri   r   r   Z_cuda_graph_attrrC   r   rf  r!   r  Zall_parametersr0  r   r   r|   r   r=   r   r   rz   r%   rf   r$   rb   )1r   rj  r   rD  r  r4  r   r  Z
root_blockrc  Zcurrent_blockZis_in_control_flowr  Ztarget_grad_blockrd  Zblock_fwd_op_num_dictr0   r  rs   r   r   r`  r,  op_pathno_grad_varsr
  Zis_recomputer  Zcheckpoint_namesr!  r   rV  
fwd_op_numr(   Zfwd_op
parametersr/   rk  paramsZparams_and_gradsZop_role_var_attr_name	grad_info
grad_blockZ	param_varr   r   rX  r   Zattr_valr   r   r   r      sV  k













	

c                 C   s$   | d u rg S t | trt| S | gS r   )ry   r   r   rF   r   r   r   _as_list_  s   ru  c                 C   s>   |j }| j}|j}|dkr||krdS ||j}|dksdS )Nr2   TF)r  r0   rb  r   )Zancestor_blockr   progZancestor_idxrb  r   r   r   _is_ancestor_blocke  s   rw  c           	      C   s   |r|d j n| }tdd |D }|j| jkst|| r|S | j}|j| jkrv|jdks/J | |j}t }t|jD ].}t|j	
 |rk|j	 D ]}|| |j	t|sj|j	t|rj|| qLq=|}|}|j| jks(|S )ad  
    In `cur_block`, get output names those linked to targets.
    NOTE:
    1. `targets` can be in `cur_block`;
    Usually, `targets` is in `cur_block`. However, considering control flow,
    2. `targets` may be in sub-block but `cur_block` is an ancestor of `targets[0].block`;
    3. `targets` may be in the block which is ancestor of `cur_block`.
    r   c                 S   r  r   r   )rK   outr   r   r   rL   }  r  z%_get_output_names.<locals>.<listcomp>r2   )r   r   r0   rw  r  rb  r   r   r   r"   r%   r.   r  r   r   r   )	Z	cur_blocktargetsr   Zcurrent_output_namesrv  Zparent_blockZparent_block_output_namesr(   r'   r   r   r   _get_output_namesr  s,   


rz  c           
      C   s   t | |}g }ttt|D ]8\}}|ds6|j D ]}||vr5||j vr5| j| j	s5|
| q|j D ]}	|	|vrF||	 q;qt|S )zc
    Find the vars which is not used in the program, and
    those vars belong to no_grad_var.
    r  )rz  r   r   rC   rf   r"   r%   r.   r]  rS   r!   r  r   )
r   rn  ry  r   r   rh  r/   r(   r  r'   r   r   r   rm    s    




rm  Fc                    s  t dd |D }t |}|du rt }dgt j |rMt jD ](\}}	t|	j |rHt	
|	jrH|	j D ]}
|
|vrF||
 q;q$d|< q$ttt jD ]L\}}	|	dr~|	d} j|}|t |	j@ }t||	t  ||}|||< t|	j |rt	
|	jr|	j D ]}
|
|vr||
 qqVd|< qV|rttt jD ]\}}	| dkrt|	j |rd|< q fddtt jD }|r|D ]}	|	j D ]}
|
|vr j|
 jr||
 qq|S )ar  
    It is used to find the grad path in `block`.

    Args:
        block(Block): The block in which to get op path.
        targets(list[Variable]): The target variables.
        inputs(list[Variable]): The input variables.
        no_grad_set(set): The set of no grad var name. no_grad_set will be changed.
        op_path_dict(dict): op_path_dict will be changed. op_path_dict will be changed.
            key(int) block index
            val(list) the op path of block(index)
        is_while(bool): Whether or not `block` is while block
    Return:
        The forward op path of block corresponding to backward op.
    c                 S   r  r   r   )rK   inpr   r   r   rL     r  z"_find_op_path_.<locals>.<listcomp>NTFr  c                    s   g | ]}| r j | qS r   )r   )rK   r/   r   Zrelevant_op_flagsr   r   rL     s
    )r   rz  r  r   r   rC   r   r"   r.   r   Zhas_non_empty_grad_op_makerr#   r%   r  r   r   rf   r*  r  r   r/  r-   r]  rS   )r   ry  rY   r   r,  r.  r&   r   r/   r(   r'   rK  r  r-  rG  rn  r   r|  r   r+    sz   










r+  c                 C   s  t | } t |}t |}| d j}|j}| jd7  _|j}|s'dgt|  }t| t|kr3td|du r;t }ntt		|}t
|}|d ttt| |j }t }	i }
i }t|D ]\}}| | }t|j}|du r|jd }|j tdd|jgid|gii  |	| td	d
|gid|gi|jd|jd}|j | |	| qb|jj|ks|jj|krtd|j|jkrtd|j|jf |j|
t|j< |	|j |j||< qb|jdkrd}	i }|D ]
}|jj|krdqttt|d }t }t|| |||}t||| |}|| |d ttt| t }t }t||| ||||	||d	 t||||
 t|||| |   g }|D ]%}|j|vr`|!d qQ||j }|d }|"|d }|!| qQt|dkr|d S |S )aR  
    Backpropagate the gradients of targets to inputs.

    Args:
        targets(Tensor|list[Tensor]|tuple[Tensor]): The target Tensors
        inputs(Tensor|list[Tensor]|tuple[Tensor]): The input Tensors
        target_gradients (Tensor|list[Tensor]|tuple[Tensor], optional): The gradient Tensors
            of targets which has the same shape with targets, If None, ones will
            be created for them.
        no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
                               should be ignored. All Tensors with
                               `stop_gradient=True` from all blocks will
                               be automatically added into this set.
                               If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set.
                               Default: None.

    Return:
        (list[Tensor]): A list of gradients for inputs
        If an input does not affect targets, the corresponding gradient Tensor
        will be None
    r   r   Nz:Should have the same number of target_gradients as targets_shaper   ZInputrU   r   ZShapeTensorr   )r   r   rR   z%all targets must be in the same blockz2The shapes of target and grad are different: %s %sz,input must be in the same program as targets)r
  r,  rE  )#ru  r   r  r?  r0   r   r   r   ri  r=  ra  r   r   r   r   r"   rJ  rC   r'   r~   r   r   r  r   rR   r   r  r+  rm  r>  rZ  rT  ri   r!   r   )ry  rY   target_gradientsr   r   rv  r   r  rp  r
  rW  rE  r/   gradtargetZ	grad_nameZtarget_shapers   r\   r`  r,  rn  ro  r  rV  Z	grad_varsZ	input_varrs  rt  r   r   r   r   calc_gradient  s   











r  c                 C   s^   t | dtjttfd t |dtjttfd t |dtjtttdfd t| |||}t|S )a  

    Backpropagate the gradients of targets to inputs.

    Args:
        targets (Tensor|list[Tensor]|tuple[Tensor]): The target Tensors.
        inputs (Tensor|list[Tensor]|tuple[Tensor]): The input Tensors.
        target_gradients (Tensor|list[Tensor]|tuple[Tensor], optional): The gradient Tensor
            of targets which has the same shape with targets, If None, ones will
            be created for them.
        no_grad_set (set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
            should be ignored. All Tensors with ``stop_gradient=True`` from all blocks will
            be automatically added into this set. If this parameter is not None, the Tensors or Tensor.names
            in this set will be added to the default set. Default: None.

    Return:
        (list[Tensor]): A list of gradients for inputs
        If an input does not affect targets, the corresponding gradient Tensor
        will be None.

    Examples:
    
        .. code-block:: python
          :name: code-example
            import paddle
            import paddle.nn.functional as F

            paddle.enable_static()

            x = paddle.static.data(name='x', shape=[None, 2, 8, 8], dtype='float32')
            x.stop_gradient=False
            y = paddle.static.nn.conv2d(x, 4, 1, bias_attr=False)
            y = F.relu(y)
            z = paddle.static.gradients([y], x)
            print(z) # [var x@GRAD : LOD_TENSOR.shape(-1, 2, 8, 8).dtype(float32).stop_gradient(False)]
    ry  zpaddle.static.gradientsrY   r~  N)r   r   r_  r   r{   r#   r  ru  )ry  rY   r~  r   Zoutsr   r   r   r     s   &c                 C   s  t | dtjjd t |dtjjd |du s|du r^t }t }| jD ]$}|jD ]}|j	D ]
}|
|j|  q-|jD ]
}|
|j|  q;q(q#|du rSt||}|du r^t||}t||}	t| d dd t||	D }
||
}W d   ||
fS 1 sw   Y  ||
fS )aW  
    :api_attr: Static Graph

    Backpropagate the gradients of the program and apply the gradients with the given optimizer.

    Args:
        program (Program): The input program.
        optimizer (Optimizer): The optimizer to apply the gradients.
        inputs (Tensor|list[Tensor]|tuple[Tensor], optional): The input Tensors.
            If None, the inputs will be created from the input variables in the given program. Default:None.
        outputs (Tensor|list[Tensor]|tuple[Tensor], optional): The output Tensors.
            If None, the outputs will be created from the output variables in the given program. Default: None.

    Return:
        tuple: tuple (optimize_ops, params_grads), A list of operators appended
            by gradients_with_optimizer and a list of (param, grad) variable pairs, param is
            ``Parameter``, grad is the gradient value corresponding to the parameter.
            The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
            indicate program pruning. If so, the program will be pruned by ``feed`` and
            ``fetch_list`` before run, see details in ``Executor``.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.static as static

            paddle.enable_static()

            img = static.data(name='image', shape=[None, 784])
            pred = static.nn.fc(x=img, size=10, activation='relu')
            loss = paddle.mean(pred)
            opt_ops, pram_grads = paddle.fluid.backward.gradients_with_optimizer(static.default_main_program(), opt)
            print(opt_ops)

    r  z&paddle.static.gradients_with_optimizer	optimizerNc                 S   s0   g | ]\}}t |tjjjr|d ur||fqS r   )ry   rA  fluidr   	Parameter)rK   Zpramr  r   r   r   rL     s    z,gradients_with_optimizer.<locals>.<listcomp>)r   rA  r  r[  r  Z	Optimizerr   r\  r   r.   r  r]  r%   r   
differencer   r   zipZapply_gradients)r  r  rY   rZ   Zin_setZout_setr   r(   r'   ZgradsZ
pram_gradsZoptimize_opsr   r   r   gradients_with_optimizer  s:   &





r  r   )NN)rT   )NNNNNN)NNNNN)NF)F
__future__r   r  r   Zpaddle.fluidr   r   rT   r   r   r=  r   loggingr	   r   r
   r   rA  Zdata_feederr   r   collections.abcr   __all__Z
get_loggerrl   INFOr;   r  r   ru   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r&  r/  r1  r2  r>  rH  rL  rT  rZ  ra  re  ri  Zstatic_onlyr   ru  rw  rz  rm  r+  r  r   r  r   r   r   r   <module>   s    ;

%
%


 
9~
  
1	

 [j  @&

Q 0