o
    Meo                     @   sD   d dl mZ d dl mZ d dlmZ g Zdd ZG dd deZdS )	    )	AmpScaler)OptimizerState)defaultdictc                   C   s
   dt jiS )Nstate)r   INIT r   r   FD:\Projects\ConvertPro\env\Lib\site-packages\paddle/amp/grad_scaler.py_refresh_optimizer_state   s   
r	   c                       s  e Zd ZdZ							d0 fdd		Z fd
dZ fddZdd Zdd Z fddZ	 fddZ
 fddZ fddZ fddZ fddZ fddZ fd d!Z fd"d#Z fd$d%Z fd&d'Z fd(d)Z fd*d+Z fd,d-Z fd.d/Z  ZS )1
GradScaleraa
  
    GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode. 
    It controls the scaling of loss, helps avoiding numerical overflow.
    The object of this class has nineteen methods `scale()`, `unscale_()`, `minimize()`, `step()`, `update()` and `get`/`set` api of parameters.

    `scale()` is used to multiply the loss by a scale ratio.
    `unscale_()` is used to unscale the gradients of parameters, multiplies the gradients of parameters by 1/(scale ratio)
    `minimize()` is similar as `optimizer.minimize()`, performs parameters updating, and it will update the loss_scaling, it equal to `step()` + `update()`.
    `step()` is similar as `optimizer.step()`, which performs parameters updating.
    `update` is used to update the loss_scaling.


    Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in 
    dynamic graph mode.

    Args:
        enable(bool, optional): Enable loss scaling or not. Default is True.
        init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15.
        incr_ratio(float, optional): The multiplier to use when increasing the loss 
                        scaling. Default is 2.0.
        decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing 
                        the loss scaling. Default is 0.5.
        incr_every_n_steps(int, optional): Increases loss scaling every n consecutive 
                                steps with finite gradients. Default is 1000.
        decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n 
                                    accumulated steps with nan or inf gradients. Default is 2.
        use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True.
    Returns:
        An GradScaler object.

    Examples:

        .. code-block:: python
            
            import paddle

            model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
            optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
            scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
            data = paddle.rand([10, 3, 32, 32])

            with paddle.amp.auto_cast():
                conv = model(data)
                loss = paddle.mean(conv)
                
            scaled = scaler.scale(loss)  # scale the loss 
            scaled.backward()            # do backward
            scaler.minimize(optimizer, scaled)  # update parameters     
            optimizer.clear_grad()
    T      @       @      ?     c              	      s    t t| ||||||| d S )N)superr
   __init__)selfenableZinit_loss_scalingZ
incr_ratioZ
decr_ratioZincr_every_n_stepsZdecr_every_n_nan_or_infZuse_dynamic_loss_scaling	__class__r   r   r   N   s   
zGradScaler.__init__c                       t t| |S )a7  
        Multiplies a Tensor by the scale factor and returns scaled outputs.  
        If this instance of :class:`GradScaler` is not enabled, output are returned unmodified.

        Args:
            var (Tensor):  The tensor to scale.
        Returns:
            The scaled tensor or original tensor.
        
        Examples:

            .. code-block:: python
                
                import paddle

                model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
                optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                data = paddle.rand([10, 3, 32, 32])

                with paddle.amp.auto_cast():
                    conv = model(data)
                    loss = paddle.mean(conv)

                scaled = scaler.scale(loss)  # scale the loss 
                scaled.backward()            # do backward
                scaler.minimize(optimizer, scaled)  # update parameters  
                optimizer.clear_grad()
        )r   r
   scale)r   varr   r   r   r   [   s   zGradScaler.scalec                    s   t t| j|g|R i |S )a  
        This function is similar as `optimizer.minimize()`, which performs parameters updating.
        
        If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
        Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters.

        Finally, the loss scaling ratio is updated.

        Args:
            optimizer(Optimizer):  The optimizer used to update parameters.
            args:  Arguments, which will be forward to `optimizer.minimize()`.
            kwargs: Keyword arguments, which will be forward to `optimizer.minimize()`.

        Examples:

            .. code-block:: python

                import paddle

                model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
                optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                data = paddle.rand([10, 3, 32, 32])

                with paddle.amp.auto_cast():
                    conv = model(data)
                    loss = paddle.mean(conv)

                scaled = scaler.scale(loss)  # scale the loss 
                scaled.backward()            # do backward
                scaler.minimize(optimizer, scaled)  # update parameters  
                optimizer.clear_grad()
        )r   r
   minimize)r   	optimizerargskwargsr   r   r   r   {   s   "zGradScaler.minimizec                 C   s   | j s| S | jt| }|d tju rtd|d tju r%| | | j	r,d| _
n|  d| _
tj|d< | jsBtt| _dS dS )a  
        This function is similar as `optimizer.step()`, which performs parameters updating.
        
        If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
        Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters.

        Args:
            optimizer(Optimizer):  The optimizer used to update parameters.

        Examples:

            .. code-block:: python
            
                # required: gpu
                import paddle

                model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
                optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                data = paddle.rand([10, 3, 32, 32])
                with paddle.amp.auto_cast():
                    conv = model(data)
                    loss = paddle.mean(conv)
                scaled = scaler.scale(loss)  # scale the loss 
                scaled.backward()            # do backward
                scaler.step(optimizer)       # update parameters
                scaler.update()              # update the loss scaling ratio
                optimizer.clear_grad()
        r   z7step() has already been called since the last update().TFN)_enablestep_optimizer_statesidr   ZSTEPPEDRuntimeErrorr   _unscaleZ
_found_infZ_cache_founf_inf_use_dynamic_loss_scalingr   r	   )r   r   Zoptimizer_stater   r   r   r      s"   

zGradScaler.stepc                 C   s&   | j sdS | jr|   tt| _dS )a  
        Updates the loss_scaling.
        
        Examples:

            .. code-block:: python
            
                # required: gpu
                import paddle

                model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
                optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                data = paddle.rand([10, 3, 32, 32])
                with paddle.amp.auto_cast():
                    conv = model(data)
                    loss = paddle.mean(conv)
                scaled = scaler.scale(loss)     # scale the loss 
                scaled.backward()               # do backward
                scaler.step(optimizer)          # update parameters
                scaler.update()                 # update the loss scaling ratio
                optimizer.clear_grad() 
        N)r   r#   _updater   r	   r   r   r   r   r   update   s   
zGradScaler.updatec                    r   )a  
        Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio).  
        If this instance of :class:`GradScaler` is not enabled, output are returned unmodified.

        Args:
            optimizer(Optimizer):  The optimizer used to update parameters.

        Returns:
            The unscaled parameters or original parameters.
        
        Examples:

            .. code-block:: python

                # required: gpu
                import paddle

                model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
                optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                data = paddle.rand([10, 3, 32, 32])
                with paddle.amp.auto_cast():
                    conv = model(data)
                    loss = paddle.mean(conv)
                scaled = scaler.scale(loss)  # scale the loss 
                scaled.backward()            # do backward
                scaler.unscale_(optimizer)    # unscale the parameter
                scaler.step(optimizer)
                scaler.update()  
                optimizer.clear_grad() 
        )r   r
   r"   )r   r   r   r   r   unscale_   s    zGradScaler.unscale_c                       t t|  S )a7  
        Enable loss scaling or not.

        Returns:
            bool: enable loss scaling return True else return False.
        
        Examples:
            .. code-block:: python

                # required: gpu,xpu
                import paddle
                scaler = paddle.amp.GradScaler(enable=True,
                                               init_loss_scaling=1024,
                                               incr_ratio=2.0,
                                               decr_ratio=0.5,
                                               incr_every_n_steps=1000,
                                               decr_every_n_nan_or_inf=2,
                                               use_dynamic_loss_scaling=True)
                enable = scaler.is_enable()
                print(enable) # True
        )r   r
   	is_enabler%   r   r   r   r)        zGradScaler.is_enablec                    r(   )a  
        Whether to use dynamic loss scaling.

        Returns:
            bool: if fixed loss_scaling is used return False, if the loss scaling is updated dynamicly return true.
        
        Examples:
            .. code-block:: python

                # required: gpu,xpu         
                import paddle
                scaler = paddle.amp.GradScaler(enable=True,
                                               init_loss_scaling=1024,
                                               incr_ratio=2.0,
                                               decr_ratio=0.5,
                                               incr_every_n_steps=1000,
                                               decr_every_n_nan_or_inf=2,
                                               use_dynamic_loss_scaling=True)
                use_dynamic_loss_scaling = scaler.is_use_dynamic_loss_scaling()
                print(use_dynamic_loss_scaling) # True
        )r   r
   is_use_dynamic_loss_scalingr%   r   r   r   r+   -  r*   z&GradScaler.is_use_dynamic_loss_scalingc                    r(   )aT  
        Return the initial loss scaling factor.

        Reurns:
            float:  the initial loss scaling factor.
        
        Examples:
            .. code-block:: python

                # required: gpu,xpu
                import paddle
                scaler = paddle.amp.GradScaler(enable=True,
                                               init_loss_scaling=1024,
                                               incr_ratio=2.0,
                                               decr_ratio=0.5,
                                               incr_every_n_steps=1000,
                                               decr_every_n_nan_or_inf=2,
                                               use_dynamic_loss_scaling=True)
                init_loss_scaling = scaler.get_init_loss_scaling()
                print(init_loss_scaling) # 1024
        )r   r
   get_init_loss_scalingr%   r   r   r   r,   E  r*   z GradScaler.get_init_loss_scalingc                       t t| | dS )a.  
        Set the initial loss scaling factor by `new_init_loss_scaling`.

        Args:
            new_init_loss_scaling(float):  The new_init_loss_scaling used to update initial loss scaling factor.
        
        Examples:
            .. code-block:: python
                
                # required: gpu,xpu
                import paddle
                scaler = paddle.amp.GradScaler(enable=True,
                                               init_loss_scaling=1024,
                                               incr_ratio=2.0,
                                               decr_ratio=0.5,
                                               incr_every_n_steps=1000,
                                               decr_every_n_nan_or_inf=2,
                                               use_dynamic_loss_scaling=True)
                print(scaler.get_init_loss_scaling()) # 1024
                new_init_loss_scaling = 1000
                scaler.set_init_loss_scaling(new_init_loss_scaling)
                print(scaler.get_init_loss_scaling()) # 1000
        N)r   r
   set_init_loss_scaling)r   Znew_init_loss_scalingr   r   r   r.   ]     z GradScaler.set_init_loss_scalingc                    r(   )al  
        Return the multiplier to use when increasing the loss scaling.

        Reurns:
            float:  the multiplier to use when increasing the loss scaling.
        
        Examples:
            .. code-block:: python

                # required: gpu,xpu
                import paddle
                scaler = paddle.amp.GradScaler(enable=True,
                                               init_loss_scaling=1024,
                                               incr_ratio=2.0,
                                               decr_ratio=0.5,
                                               incr_every_n_steps=1000,
                                               decr_every_n_nan_or_inf=2,
                                               use_dynamic_loss_scaling=True)
                incr_ratio = scaler.get_incr_ratio()
                print(incr_ratio) # 2.0
        )r   r
   get_incr_ratior%   r   r   r   r0   w  r*   zGradScaler.get_incr_ratioc                    r-   )a4  
        Set the multiplier to use when increasing the loss scaling by `new_incr_ratio`, `new_incr_ratio` should > 1.0.

        Args:
            new_incr_ratio(float):  The new_incr_ratio used to update the multiplier to use when increasing the loss scaling.
        
        Examples:
            .. code-block:: python

                # required: gpu,xpu
                import paddle
                scaler = paddle.amp.GradScaler(enable=True,
                                               init_loss_scaling=1024,
                                               incr_ratio=2.0,
                                               decr_ratio=0.5,
                                               incr_every_n_steps=1000,
                                               decr_every_n_nan_or_inf=2,
                                               use_dynamic_loss_scaling=True)
                print(scaler.get_incr_ratio()) # 2.0
                new_incr_ratio = 3.0
                scaler.set_incr_ratio(new_incr_ratio)
                print(scaler.get_incr_ratio()) # 3.0
        N)r   r
   set_incr_ratio)r   Znew_incr_ratior   r   r   r1     r/   zGradScaler.set_incr_ratioc                    r(   )a  
        Get the less-than-one-multiplier to use when decreasing the loss scaling.

        Reurns:
            float:  the less-than-one-multiplier to use when decreasing the loss scaling.
        
        Examples:
            .. code-block:: python

                # required: gpu,xpu
                import paddle
                scaler = paddle.amp.GradScaler(enable=True,
                                               init_loss_scaling=1024,
                                               incr_ratio=2.0,
                                               decr_ratio=0.5,
                                               incr_every_n_steps=1000,
                                               decr_every_n_nan_or_inf=2,
                                               use_dynamic_loss_scaling=True)
                decr_ratio = scaler.get_decr_ratio()
                print(decr_ratio) # 0.5
        )r   r
   get_decr_ratior%   r   r   r   r2     r*   zGradScaler.get_decr_ratioc                    r-   )aP  
        Set the less-than-one-multiplier to use when decreasing the loss scaling by `new_incr_ratio`, `new_decr_ratio` should < 1.0.

        Args:
            new_decr_ratio(float):  The new_decr_ratio used to update the less-than-one-multiplier to use when decreasing the loss scaling.
        
        Examples:
            .. code-block:: python

                # required: gpu,xpu
                import paddle
                scaler = paddle.amp.GradScaler(enable=True,
                                               init_loss_scaling=1024,
                                               incr_ratio=2.0,
                                               decr_ratio=0.5,
                                               incr_every_n_steps=1000,
                                               decr_every_n_nan_or_inf=2,
                                               use_dynamic_loss_scaling=True)
                print(scaler.get_decr_ratio()) # 0.5
                new_decr_ratio = 0.1
                scaler.set_decr_ratio(new_decr_ratio)
                print(scaler.get_decr_ratio()) # 0.1
        N)r   r
   set_decr_ratio)r   Znew_decr_ratior   r   r   r3     r/   zGradScaler.set_decr_ratioc                    r(   )a  
        Return the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.

        Reurns:
            int:  the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.
        
        Examples:
            .. code-block:: python

                # required: gpu,xpu
                import paddle
                scaler = paddle.amp.GradScaler(enable=True,
                                               init_loss_scaling=1024,
                                               incr_ratio=2.0,
                                               decr_ratio=0.5,
                                               incr_every_n_steps=1000,
                                               decr_every_n_nan_or_inf=2,
                                               use_dynamic_loss_scaling=True)
                incr_every_n_steps = scaler.get_incr_every_n_steps()
                print(incr_every_n_steps) # 1000
        )r   r
   get_incr_every_n_stepsr%   r   r   r   r4     r*   z!GradScaler.get_incr_every_n_stepsc                    r-   )a  
        Set the num `n` by `new_incr_every_n_steps`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.

        Args:
            new_incr_every_n_steps(int):  The new_incr_every_n_steps used to update the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.
        
        Examples:
            .. code-block:: python

                # required: gpu,xpu
                import paddle
                scaler = paddle.amp.GradScaler(enable=True,
                                               init_loss_scaling=1024,
                                               incr_ratio=2.0,
                                               decr_ratio=0.5,
                                               incr_every_n_steps=1000,
                                               decr_every_n_nan_or_inf=2,
                                               use_dynamic_loss_scaling=True)
                print(scaler.get_incr_every_n_steps()) # 1000
                new_incr_every_n_steps = 2000
                scaler.set_incr_every_n_steps(new_incr_every_n_steps)
                print(scaler.get_incr_every_n_steps()) # 2000
        N)r   r
   set_incr_every_n_steps)r   Znew_incr_every_n_stepsr   r   r   r5     r/   z!GradScaler.set_incr_every_n_stepsc                    r(   )a  
        Return the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.

        Reurns:
            int:  the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.
        
        Examples:
            .. code-block:: python

                # required: gpu,xpu
                import paddle
                scaler = paddle.amp.GradScaler(enable=True,
                                               init_loss_scaling=1024,
                                               incr_ratio=2.0,
                                               decr_ratio=0.5,
                                               incr_every_n_steps=1000,
                                               decr_every_n_nan_or_inf=2,
                                               use_dynamic_loss_scaling=True)
                decr_every_n_nan_or_inf = scaler.get_decr_every_n_nan_or_inf()
                print(decr_every_n_nan_or_inf) # 2
        )r   r
   get_decr_every_n_nan_or_infr%   r   r   r   r6     r*   z&GradScaler.get_decr_every_n_nan_or_infc                    r-   )a  
        Set the num `n` by `new_decr_every_n_nan_or_inf`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.

        Args:
            new_decr_every_n_nan_or_inf(int):  The new_decr_every_n_nan_or_inf used to update the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.
        
        Examples:
            .. code-block:: python

                # required: gpu,xpu
                import paddle
                scaler = paddle.amp.GradScaler(enable=True,
                                               init_loss_scaling=1024,
                                               incr_ratio=2.0,
                                               decr_ratio=0.5,
                                               incr_every_n_steps=1000,
                                               decr_every_n_nan_or_inf=2,
                                               use_dynamic_loss_scaling=True)
                print(scaler.get_decr_every_n_nan_or_inf()) # 2
                new_decr_every_n_nan_or_inf = 3
                scaler.set_decr_every_n_nan_or_inf(new_decr_every_n_nan_or_inf)
                print(scaler.get_decr_every_n_nan_or_inf()) # 3
        N)r   r
   set_decr_every_n_nan_or_inf)r   Znew_decr_every_n_nan_or_infr   r   r   r7   %  s   z&GradScaler.set_decr_every_n_nan_or_infc                    r(   )an  
        Returns the state of the scaler as a `dict`, If this instance is not enabled, returns an empty dict.

        Reurns:
            A dict of scaler includes:
            scale (tensor): The loss scaling factor.
            incr_ratio(float): The multiplier to use when increasing the loss scaling.
            decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling.
            incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients.
            decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients.
            incr_count(int): The number of recent consecutive unskipped steps.
            decr_count(int): The number of recent consecutive skipped steps.
            use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True.

        
        Examples:

            .. code-block:: python

                # required: gpu,xpu
                import paddle

                scaler = paddle.amp.GradScaler(enable=True,
                                               init_loss_scaling=1024,
                                               incr_ratio=2.0,
                                               decr_ratio=0.5,
                                               incr_every_n_steps=1000,
                                               decr_every_n_nan_or_inf=2,
                                               use_dynamic_loss_scaling=True)
                scaler_state = scaler.state_dict()
        )r   r
   
state_dictr%   r   r   r   r8   @  s    zGradScaler.state_dictc                    r-   )a  
        Loads the scaler state.
        
        Args:
           state_dict(dict): scaler state.  Should be an object returned from a call to `GradScaler.state_dict()`.
                
        Examples:

            .. code-block:: python

                # required: gpu,xpu
                import paddle

                scaler = paddle.amp.GradScaler(enable=True,
                                               init_loss_scaling=1024,
                                               incr_ratio=2.0,
                                               decr_ratio=0.5,
                                               incr_every_n_steps=1000,
                                               decr_every_n_nan_or_inf=2,
                                               use_dynamic_loss_scaling=True)
                scaler_state = scaler.state_dict()
                scaler.load_state_dict(scaler_state)
        N)r   r
   load_state_dict)r   r8   r   r   r   r9   b  r/   zGradScaler.load_state_dict)Tr   r   r   r   r   T)__name__
__module____qualname____doc__r   r   r   r   r&   r'   r)   r+   r,   r.   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   __classcell__r   r   r   r   r
      s:    4 $5""r
   N)Zpaddle.fluid.dygraph.ampr   r   collectionsr   __all__r	   r
   r   r   r   r   <module>   s   