o
    MeZ                     @   s   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dlZd dl	m
Z
 d dlZddlmZ g Zd	d
 Zdd Zejdd ZG dd deZG dd deZdd Zdd ZdS )    N)core)PyLayer)LegacyPyLayer)	framework)in_dygraph_mode   )loggerc                 C   sP   g }| D ]}t |tjjtjfs|| q| }|j|_|| qt|S N)	
isinstancer   eagerTensorVarBaseappenddetachstop_gradienttuple)inputsoutinpx r   \D:\Projects\ConvertPro\env\Lib\site-packages\paddle/distributed/fleet/recompute/recompute.pydetach_variable   s   
r   c                 C   s$   t dd | D std d S d S )Nc                 s   s.    | ]}t |tjjtjfr|jd kV  qdS )FN)r
   r   r   r   paddler   ).0Zinput_r   r   r   	<genexpr>,   s    
z,check_recompute_necessary.<locals>.<genexpr>z[Recompute]: None of the inputs to current recompute block need grad, therefore there is NO need to recompute this block in backward !)anyr   warning)r   r   r   r   check_recompute_necessary+   s   r   c              	   c   sr    ddl m} t }|  }t|  | | zd V  W t| | | d S t| | | w )Nr   get_rng_state_tracker)=paddle.distributed.fleet.meta_parallel.parallel_layers.randomr    r   get_cuda_rng_stateget_states_trackerZset_cuda_rng_stateZset_states_tracker)Z	rng_statetrackerr    Zorig_cuda_rng_stateZorig_cuda_rng_trackerr   r   r   swith_rng_state_tracker7   s   



r%   c                   @   $   e Zd Zedd Zedd ZdS )LegacyRecomputeFunctionc                 G   s  ddl m} || _|| _g | _g | _g }t|D ]!\}}t|r3|	| | j	| | j	d  q| j	| q| j
|  | jr\t }d|vrQtd|t | _|  | _t }	|	jtjjkridnd| _|	jtjjkrwd| _n|	jtjjtjjfv rd| _ntd	|	j|	jd
krd
| _n|	jdv rd| _ntd|	j|	 \| _ | _!t"  || }
W d    |
S 1 sw   Y  |
S Nr   r   zgpu:z>Recompute with RNG perserve is not support current device: {}.FTO2O1zunsupported amp level: {}Zfloat16)bfloat16Zfloat32r+   zunsupported amp dtype: {})#r!   r    run_functionpreserve_rng_stater   tensor_indices	enumerater   	is_tensorr   save_for_backward
get_deviceRuntimeErrorformatr"   fw_cuda_rng_stater#   fwd_cuda_rng_state_trackerr   _dygraph_tracer
_amp_levelr   AmpLevelO0is_fw_autocastr)   	amp_levelr*   
ValueError
_amp_dtype	amp_dtype_get_amp_op_listamp_white_listamp_black_listno_grad)ctxr,   r-   argsr    tensor_inputsiarg
cur_devicetraceroutputsr   r   r   forwardJ   s`   










zLegacyRecomputeFunction.forwardc              
   G   sB  ddl m} tjj 	 t| j}| j}| 	 }t
|D ]
\}}|| ||< qt }d|_| jrvt| j| j3 tjj| j| j| j| j| jd tt|}	| j|	 }
W d    n1 saw   Y  W d    n1 spw   Y  n+tjj| j| j| j| j| jd tt|}	| j|	 }
W d    n1 sw   Y  t|
tjr|
f}
t|
t|ksJ g }g }t t|
D ]}t|
| tjr|
| j!s|"|
|  |"||  qt|dkrt#dtjjdd tj$%|| W d    n1 sw   Y  tdd	 |	D }|W  d    S 1 sw   Y  d S )
Nr   r   TenableZcustom_white_listZcustom_black_listlevelZdtypeHnone of output has requires_grad=True, this recompute() is not necessaryFrN   c                 s   s$    | ]}t |tjr| V  qd S r	   )r
   r   r   
_grad_ivarr   r   r   r   r   r      s    

z3LegacyRecomputeFunction.backward.<locals>.<genexpr>)&r!   r    r   fluiddygraphguardlistr   r.   saved_tensorr/   r   r7   	_has_gradr-   r%   r5   r6   amp	auto_castr;   rA   rB   r<   r?   r   r   r,   r
   r   r   lenranger   r   r3   autogradbackwardrD   rE   r    r   r.   ZtensorsrG   idxrJ   Zdetached_inputsrK   Zforward_outputs_with_gradZbackward_inputs_with_gradZgradsr   r   r   r_      s~   

&z LegacyRecomputeFunction.backwardN__name__
__module____qualname__staticmethodrL   r_   r   r   r   r   r'   I   s
    
Fr'   c                   @   r&   )RecomputeFunctionc                 O   s  ddl m} || _|| _|| _g | _g | _g }t|D ]!\}}t	|r6|
| | j
| | j
d  q| j
| q| j|  | jr_t }	d|	vrTtd|	t | _|  | _t }
|
jtjjkrldnd| _|
jtjjkrzd| _n|
jtjjtjjfv rd| _ntd	|
j|
jd
krd
| _n|
jdv rd| _ntd|
j|
  \| _!| _"t#  ||i |}W d    |S 1 sw   Y  |S r(   )$r!   r    r,   r-   kwargsr   r.   r/   r   r0   r   r1   r2   r3   r4   r"   r5   r#   r6   r   r7   r8   r   r9   r:   r;   r)   r<   r*   r=   r>   r?   r@   rA   rB   rC   )rD   r,   r-   rE   rh   r    rF   rG   rH   rI   rJ   rK   r   r   r   rL      sb   









zRecomputeFunction.forwardc              
   G   s  ddl m} tjj ( t| j}| j}| 	 }t
|D ]
\}}|| ||< qt }d|_| jrzt| j| j7 tjj| j| j| j| j| jd tt|}	| j|	i | j}
W d    n1 sew   Y  W d    n1 stw   Y  n/tjj| j| j| j| j| jd tt|}	| j|	i | j}
W d    n1 sw   Y  t|
tjtj j!fr|
f}
t"|
t"|ksJ g }g }t#t"|
D ]!}t|
| tjtj j!fr|
| j$s|%|
|  |%||  qt"|dkrt&dtjjdd tj'(|| W d    n	1 sw   Y  t) r#tdd	 |	D }n	td
d	 |	D }|W  d    S 1 s9w   Y  d S )Nr   r   TrM   rP   FrQ   c                 s   ,    | ]}t |tjtjjfr| V  qd S r	   r
   r   r   r   r   rR   rS   r   r   r   r   }      
z-RecomputeFunction.backward.<locals>.<genexpr>c                 s   ri   r	   rj   rS   r   r   r   r     rk   )*r!   r    r   rT   rU   rV   rW   r   r.   rX   r/   r   r7   rY   r-   r%   r5   r6   rZ   r[   r;   rA   rB   r<   r?   r   r   r,   rh   r
   r   r   r   r   r\   r]   r   r   r3   r^   r_   r   r`   r   r   r   r_   .  s   


&zRecomputeFunction.backwardNrb   r   r   r   r   rg      s
    
Grg   c                 O   s8   | dd}t jrt| tj| |g|R i |S )a  
    recompute intermediate activations to save then memory.

    Parameters:
        function(paddle.nn.Layer): layer of sequence of layers that describes part of forward pass of the model
              whose intermediate activations will be released to save memory in forward stage and will be recomputed
              in backward stage for gradient calculation.
        *args(Tensor): inputs to the function.
        **kwargs(Dict): Kwargs should only contain the key-value pair of preserve_rng_state, which is used to
              indicate whether to save the forward rng. If it is True, then the last forward rng value will be
              restored when the forward recalculation of backpropagation is performed. The default
              preserve_rng_state is True.

    Returns:
        Output of function on args.

    Examples:
        .. code-block:: python

            import paddle
            from paddle.distributed.fleet.utils import recompute
            import random
            # required: gpu
            def get_fc_block(block_idx, input_size, is_last=False):
                block_name = "block_" + str(block_idx)
                block = paddle.nn.Sequential(
                    (block_name + "_fc_0", paddle.nn.Linear(input_size, input_size, bias_attr=False)),
                    (block_name + "_dropout", paddle.nn.Dropout(p=0.5)),
                    (block_name + "_relu_1", paddle.nn.ReLU()),
                    (block_name + "_fc_1", paddle.nn.Linear(input_size, input_size, bias_attr=False)),
                    (block_name + "_relu_2", paddle.nn.ReLU()),
                )
                if is_last:
                    block.add_sublayer(
                        block_name + "_fc_2",
                        paddle.nn.Linear(
                            input_size, 1, bias_attr=False
                        )
                    )
                else:
                    block.add_sublayer(
                        block_name + "_fc_2",
                        paddle.nn.Linear(input_size, input_size, bias_attr=False)
                    )
                return block
            class Naive_fc_net(paddle.nn.Layer):
                def __init__(self, input_size=10,
                            recompute_blocks=[1, 3],
                            recompute_kwargs={}):
                    super(Naive_fc_net, self).__init__()
                    self.recompute_blocks = recompute_blocks
                    self.recompute_kwargs = recompute_kwargs
                    self.runfunc0 = get_fc_block(0, input_size, is_last=False)
                    self.runfunc1 = get_fc_block(1, input_size, is_last=False)
                    self.runfunc2 = get_fc_block(2, input_size, is_last=False)
                    self.runfunc3 = get_fc_block(3, input_size, is_last=False)
                    self.runfunc4 = get_fc_block(4, input_size, is_last=True)
                    self.total_func = [self.runfunc0, self.runfunc1, self.runfunc2, self.runfunc3, self.runfunc4]
                def forward(self, inputs):
                    nums = len(self.total_func)
                    for i in range(nums):
                        if i in self.recompute_blocks:
                            inputs = recompute(self.total_func[i], inputs, **{"preserve_rng_state": True})
                        else:
                            inputs = self.total_func[i](inputs)
                    return inputs
            def run_model(cuda_state, recompute_block=[], recompute_kwargs={}):
                gen = paddle.seed(10)
                gen.manual_seed(10)
                random.seed(10)
                if cuda_state:
                    paddle.set_cuda_rng_state(cuda_state)
                batch_size, input_size = 1, 10
                model = Naive_fc_net(
                    input_size,
                    recompute_blocks=recompute_block,
                    recompute_kwargs=recompute_kwargs)
                optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                loss_ = []
                param_ = []
                grad_ = []
                for _ in range(5):
                    x = paddle.rand(shape=[batch_size, input_size], dtype="float32")
                    y_pred = model(x)
                    loss = y_pred.mean()
                    loss_.append(loss.item())
                    loss.backward()
                    optimizer.step()
                    param_.append(model.parameters()[9])
                    grad_.append(model.parameters()[3]._grad_ivar())
                    optimizer.clear_grad()
                return loss_, param_, grad_
            cuda_state = paddle.get_cuda_rng_state()
            # without recompute
            loss_ref, param_ref, grad_ref = run_model(
                cuda_state, recompute_block=[]
            )
            loss, param, grad = run_model(cuda_state, recompute_block=[1, 2])
            print("normal_loss: {}, recompute_loss: {}".format(loss_ref, loss))
            # The result of the recompute_loss should be the same as the normal_loss.
    r-   T)popr   r7   rY   r   rg   apply)functionrE   rh   Zpreserver   r   r   	recompute  s   g
ro   c           
      O   s   |  dd}|  dd}dd }t|tjjrt| }t|| }d}td||d  |D ]}	|	| d }t	||	||g|R d|i|}q/||d t|d ||S )	a  
    recompute intermediate activations to save then memory for 'Sequential' models.

    Parameters:
        ctx(dict): include 'segments' and  'preserve_rng_state' keys, the key 'segments' (int, default 1), represents the number of chunks to create in the model,
                   the key 'preserve_rng_state' (bool, optional, default=True) indicate whether to save the forward rng. If it is True, then the last forward rng value will be
                   restored when the forward recalculation of backpropagation is performed. and some keys such as 'mp_group', 'offload' and 'partition' are invalid here,
                   they are useful in 'recompute_hybrid' API.
        functions(paddle.nn.Sequential): layer of sequence of layers that describes part of forward pass of the model
              whose intermediate activations will be released to save memory in forward stage and will be recomputed
              in backward stage for gradient calculation.
        *args(Tensor): inputs(tuple) to the function.
        **kwargs(Dict): inputs(dict) to the function.

    Returns:
        Output of function on args and kwargs.

    Examples:
        .. code-block:: python

            model = paddle.nn.Sequential(...)
            input = recompute_sequential({'segments' : 1}, model, input)
    segments   r-   Tc                    s    fdd}|S )Nc                    s$   t  d D ]}| | } q| S )Nrq   )r]   )inputrG   beginendfuncsr   r   do_run  s   z7recompute_sequential.<locals>._run_func.<locals>.do_runr   )rt   ru   rv   rw   r   rs   r   	_run_func  s   z'recompute_sequential.<locals>._run_funcr   )
getr
   r   nnZ
SequentialrW   childrenr\   r]   ro   )
rD   Z	functionsrE   rh   rp   r-   rx   Zsegment_sizeru   rt   r   r   r   recompute_sequential  s&   
r}   )r   Zpaddle.fluidr   Zpaddle.autogradr   Zpaddle.autograd.py_layerr   r   
contextlibZpaddle.fluid.frameworkr   loggingZutils.log_utilr   __all__r   r   contextmanagerr%   r'   rg   ro   r}   r   r   r   r   <module>   s(   
  'o