o
    Qe                     @   s  d dl Z ddlmZ ddlmZ ddlmZmZmZ ddlm	Z	 ddlm
Z
 dd	lmZmZ dd
lmZ ddlmZmZmZ d dlZd dlZd dlZddlmZ ddlmZ d dlmZmZ ddlmZ d dlmZ d dl m!Z!m"Z" g Z#G dd deZ$G dd de$Z%G dd de$Z&G dd de$Z'G dd deZ(G dd deZ)G dd deZ*G d d! d!e*Z+G d"d# d#e*Z,G d$d% d%e*Z-G d&d' d'e*Z.G d(d) d)eZ/dS )*    N   )	BatchNorm)SpectralNorm)get_default_dtypeset_default_dtype_non_static_mode   )Constant)	ParamAttr)check_variable_and_dtype
check_type)dygraph_utils)
batch_norm
layer_norminstance_norm)no_grad)
functional)_C_ops_legacy_C_ops)Layer)in_dynamic_mode)in_dygraph_mode_in_legacy_dygraphc                       sF   e Zd ZdZ						d fdd	Zdd	 Zd
d Zdd Z  ZS )_InstanceNormBasez
    This class is based class for InstanceNorm1D, 2d, 3d.

    See InstaceNorm1D, InstanceNorm2D or InstanceNorm3D for more details.
    h㈵>?NNCHWc                    s   t t|   |dks|dkr||ksJ d|| _|| _|| _|| _|dkrI|dkrI| j| j|gtddd| _	| j| j|gtddd| _
d S d | _	d | _
d S )NFzOweight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm      ?attrshapedefault_initializeris_bias        T)superr   __init___epsilon_weight_attr
_bias_attr_num_featurescreate_parameterr	   scalebias)selfnum_featuresepsilonmomentumweight_attr	bias_attrdata_formatname	__class__ DD:\Projects\ConvertPro\env\Lib\site-packages\paddle/nn/layer/norm.pyr%   @   s0   


z_InstanceNormBase.__init__c                 C      t d)NzInstanceNorm Base errorNotImplementedErrorr-   inputr7   r7   r8   _check_input_dimf      z"_InstanceNormBase._check_input_dimc                 C   s    |  | t|| j| j| jdS )N)weightr,   Zeps)r>   r   r+   r,   r&   r<   r7   r7   r8   forwardi   s   
z_InstanceNormBase.forwardc                 C      d | j| jS )Nznum_features={}, epsilon={})formatr)   r&   r-   r7   r7   r8   
extra_reprp      z_InstanceNormBase.extra_repr)r   r   NNr   N)	__name__
__module____qualname____doc__r%   r>   rA   rE   __classcell__r7   r7   r5   r8   r   9   s    	&r   c                   @      e Zd ZdZdd ZdS )InstanceNorm1Da  
    Create a callable object of `InstanceNorm1D`. Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization .

    DataLayout: NCL `[batch, in_channels, length]`

    :math:`input` is the input features over a mini-batch.

    ..  math::
        
        \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\
        \ mean\ of\ one\  feature\ map\ in\ mini-batch \\
        \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \
        \mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\
        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
        \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

Where `H` means height of feature map, `W` means width of feature map.

    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
        epsilon(float, optional): A value added to the denominator for
            numerical stability. Default is 1e-5.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
             of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
	     will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr.
	     If the Initializer of the weight_attr is not set, the parameter is initialized 
	     one. If it is set to False, will not create weight_attr. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm.
             If it is set to None or one attribute of ParamAttr, instance_norm
	     will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
	     If the Initializer of the bias_attr is not set, the bias is initialized zero. 
             If it is set to False, will not create bias_attr. Default: None.
        data_format(str, optional): Specify the input data format, may be "NC", "NCL". Default "NCL".
        name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..


    Shape:
        - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length).
        - output: 3-D tensor with same shape as input x.

    Returns:
        None.


    Examples:

        .. code-block:: python

          import paddle

          x = paddle.rand((2, 2, 3))
          instance_norm = paddle.nn.InstanceNorm1D(2)
          instance_norm_out = instance_norm(x)

          print(instance_norm_out)

    c                 C   8   t |jdkrt |jdkrtdt |jd S d S Nr   r   z'expected 2D or 3D input (got {}D input)lenr    
ValueErrorrC   r<   r7   r7   r8   r>         zInstanceNorm1D._check_input_dimNrG   rH   rI   rJ   r>   r7   r7   r7   r8   rM   v   s    <rM   c                   @   rL   )InstanceNorm2Da
  
    Create a callable object of `InstanceNorm2D`. Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization .

    DataLayout: NCHW `[batch, in_channels, in_height, in_width]`


    :math:`input` is the input features over a mini-batch.

    ..  math::
        
        \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\
        \ mean\ of\ one\  feature\ map\ in\ mini-batch \\
        \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \
        \mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\
        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
        \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

Where `H` means height of feature map, `W` means width of feature map.

    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
        epsilon(float, optional): A value added to the denominator for
            numerical stability. Default is 1e-5.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
             of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
	     will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr.
	     If the Initializer of the weight_attr is not set, the parameter is initialized 
	     one. If it is set to False, will not create weight_attr. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm.
             If it is set to None or one attribute of ParamAttr, instance_norm
	     will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
	     If the Initializer of the bias_attr is not set, the bias is initialized zero. 
             If it is set to False, will not create bias_attr. Default: None.
        data_format(str, optional): Specify the input data format, could be "NCHW". Default: NCHW.
        name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..

    Shape:
        - x: 4-D tensor with shape: (batch, num_features, height, weight).
        - output: 4-D tensor with same shape as input x.

    Returns:
        None.


    Examples:

        .. code-block:: python

          import paddle

          x = paddle.rand((2, 2, 2, 3))
          instance_norm = paddle.nn.InstanceNorm2D(2)
          instance_norm_out = instance_norm(x)

          print(instance_norm_out)
    c                 C   &   t |jdkrtdt |jd S N   z!expected 4D input (got {}D input)rP   r<   r7   r7   r8   r>      
   zInstanceNorm2D._check_input_dimNrT   r7   r7   r7   r8   rU          ;rU   c                   @   rL   )InstanceNorm3Da  
    Create a callable object of `InstanceNorm3D`. Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization .

    DataLayout: NCHW `[batch, in_channels, D, in_height, in_width]`


    :math:`input` is the input features over a mini-batch.

    ..  math::
        
        \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\
        \ mean\ of\ one\  feature\ map\ in\ mini-batch \\
        \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \
        \mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\
        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
        \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

Where `H` means height of feature map, `W` means width of feature map.

    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
        epsilon(float, optional): A value added to the denominator for
            numerical stability. Default is 1e-5.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
             of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
	     will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr.
	     If the Initializer of the weight_attr is not set, the parameter is initialized 
	     one. If it is set to False, will not create weight_attr. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm.
             If it is set to None or one attribute of ParamAttr, instance_norm
	     will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
	     If the Initializer of the bias_attr is not set, the bias is initialized zero. 
             If it is set to False, will not create bias_attr. Default: None.
        data_format(str, optional): Specify the input data format, could be "NCDHW". Default: NCDHW.
        name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..

    Shape:
        - x: 5-D tensor with shape: (batch, num_features, dims, height, weight).
        - output: 5-D tensor with same shape as input x.

    Returns:
        None.


    Examples:

        .. code-block:: python

          import paddle

          x = paddle.rand((2, 2, 2, 2, 3))
          instance_norm = paddle.nn.InstanceNorm3D(2)
          instance_norm_out = instance_norm(x)

          print(instance_norm_out.numpy)
    c                 C   rV   N   z!expected 5D input (got {}D input)rP   r<   r7   r7   r8   r>   ;  rY   zInstanceNorm3D._check_input_dimNrT   r7   r7   r7   r8   r[      rZ   r[   c                       s<   e Zd ZdZ					d fdd	Zdd Zd	d
 Z  ZS )	GroupNormaf  
    This interface is used to construct a callable object of the ``GroupNorm`` class.
    For more details, refer to code examples.
    It implements the function of the Group Normalization Layer.
    Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_ .

    Parameters:
        num_groups(int): The number of groups that divided from channels.
        num_channels(int): The number of channels of input.
        epsilon(float, optional): The small value added to the variance to prevent
                                  division by zero. Default: 1e-05.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
                                         scale :math:`g`. If it is set to False, no scale will be added to the output units.
                                         If it is set to None, the bias is initialized one. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
                                        bias :math:`b`. If it is set to False, no bias will be added to the output units.
                                        If it is set to None, the bias is initialized zero. Default: None.
        data_format(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW.
        name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..

    Shape:
        - x: Tensor with shape: (batch, num_features, *).
        - output: The same shape as input x.

    Returns:
        None

    Examples:
        .. code-block:: python

            import paddle

            x = paddle.arange(48, dtype="float32").reshape((2, 6, 2, 2))
            group_norm = paddle.nn.GroupNorm(num_channels=6, num_groups=6)
            group_norm_out = group_norm(x)

            print(group_norm_out)
    r   Nr   c           	         s   t t|   || _|| _|| _|| _|| _|dkr td| | jg}|dkr8| j	d |t
dd| _d| j_n| j	| j|t
dd| _| jd koN| jjdk| j_|dkrg| j	d |t
ddd| _d| j_d S | j	| j|dd	| _| jd ko{| jjdk| j_d S )
Nr   zunsupported data layout:Fr   r   r    r!   Tr#   r   r   r    r"   )r$   r^   r%   r'   r(   r&   _num_channels_num_groupsrR   r*   r	   r@   stop_gradientlearning_rater,   )	r-   Z
num_groupsZnum_channelsr/   r1   r2   r3   r4   param_shaper5   r7   r8   r%   j  sF   





zGroupNorm.__init__c                 C   s  | j j|jdd}| j j|jdd}t r*t|| j| j| j| j	d}t
j|d dS t rHt|| j| j||d| jd| j		\}}}t
j|d dS d|i}| jd urV| j|d< | jd ur`| j|d	< | j j|jd
}| j jd||||d| j| j	dd | j |d S )NTdtyperc   r   )Zactr/   groupsXBiasScale)rg   
group_norm)YMeanVariance)r/   rh   typeinputsoutputsattrs)_helper"create_variable_for_type_inferencerg   r   r   rl   r@   r,   r&   rb   r   Z_append_activation_in_dygraphr   r   	append_opZappend_activation)r-   r=   mean_outvariance_outZpre_act_rr   Zgroup_norm_outr7   r7   r8   rA     sf   	




zGroupNorm.forwardc                 C   s   d | j| j| jS )Nz*num_groups={}, num_channels={}, epsilon={})rC   rb   ra   r&   rD   r7   r7   r8   rE     s   zGroupNorm.extra_repr)r   NNr   NrG   rH   rI   rJ   r%   rA   rE   rK   r7   r7   r5   r8   r^   B  s    +5>r^   c                       s:   e Zd ZdZ				d
 fdd	Zdd Zdd	 Z  ZS )	LayerNormal	  
    Construct a callable object of the ``LayerNorm`` class.
    For more details, refer to code examples.
    It implements the function of the Layer Normalization Layer and can be applied to mini-batch input data.
    Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_

    The formula is as follows:

    ..  math::

        \mu & = \frac{1}{H}\sum_{i=1}^{H} x_i

        \sigma & = \sqrt{\frac{1}{H}\sum_{i=1}^{H}{(x_i - \mu)^2} + \epsilon}

        y & = f(\frac{g}{\sigma}(x - \mu) + b)

    - :math:`x`: the vector representation of the summed inputs to the neurons in that layer.
    - :math:`H`: the number of hidden units in a layers
    - :math:`\epsilon`: the small value added to the variance to prevent division by zero.
    - :math:`g`: the trainable scale parameter.
    - :math:`b`: the trainable bias parameter.

    Parameters:
        normalized_shape(int|list|tuple): Input shape from an expected input of
            size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`.
            If it is a single integer, this module will normalize over the last dimension
            which is expected to be of that specific size.
        epsilon(float, optional): The small value added to the variance to prevent
            division by zero. Default: 1e-05.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
            gain :math:`g`. If False, weight is None. If is None, a default :code:`ParamAttr` would be added as scale. The
            :attr:`param_attr` is initialized as 1 if it is added. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
            bias :math:`b`. If is False, bias is None. If is None, a default :code:`ParamAttr` would be added as bias. The
            :attr:`bias_attr` is initialized as 0 if it is added. Default: None.
        name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..

    Shape:
        - x: 2-D, 3-D, 4-D or 5-D tensor.
        - output: same shape as input x.

    Returns:
        None

    Examples:

        .. code-block:: python

          import paddle

          x = paddle.rand((2, 2, 2, 3))
          layer_norm = paddle.nn.LayerNorm(x.shape[1:])
          layer_norm_out = layer_norm(x)

          print(layer_norm_out)
    r   Nc                    s   t t|   t|tjr|g}t|| _|| _|| _	|| _
t| jg}|du r-d | _n| j| j	|tdd| _|du rBd | _d S | j| j
|dd| _d S )NFr   r_   Tr`   )r$   r|   r%   
isinstancenumbersIntegrallist_normalized_shaper&   r'   r(   npprodr@   r*   r	   r,   )r-   normalized_shaper/   r1   r2   r4   re   r5   r7   r8   r%     s(   

zLayerNorm.__init__c                 C   s   t || j| j| j| jdS )N)r   r@   r,   r/   )r   r   r@   r,   r&   r<   r7   r7   r8   rA   ?  s   zLayerNorm.forwardc                 C   rB   )Nznormalized_shape={}, epsilon={})rC   r   r&   rD   r7   r7   r8   rE   H  rF   zLayerNorm.extra_repr)r   NNNr{   r7   r7   r5   r8   r|     s    <"	r|   c                       sP   e Zd ZdZ							d fdd	Zdd	 Zd
d Zdd Zdd Z  Z	S )_BatchNormBasez
    BatchNorm base .
    r   r   Nr   c	                    s  t t|   || _|| _|| _|| _t dkrd| _nt | _|g}	|dkr9| j	d |	| jt
dd| _d| j_n| j	| j|	| jt
dd| _| jd koQ| jjdk| j_|dkrk| j	d |	| jt
ddd| _d| j_n| j	| j|	| jdd	| _| jd ko| jjdk| j_d }
d }|d ur|d
 }
|d }| j	| jt|
t
dddd|	d| _d| j_| j	| jt|t
dddd|	d| _d| j_|| _d| _|| _|| _d| _|| _d S )Nfloat16float32Fr   )r   r    rg   r!   Tr#   )r   r    rg   r!   r"   )r   r    rg   r"   _mean	_variance)r4   initializerZ	trainableZdo_model_average)rg   r   r    )r$   r   r%   r)   r'   r(   _use_global_statsr   _dtyper*   r	   r@   rc   rd   r,   r
   r   r   _data_formatZ	_in_place	_momentumr&   Z_fuse_with_relu_name)r-   r.   r0   r/   r1   r2   r3   use_global_statsr4   re   Zmoving_mean_nameZmoving_variance_namer5   r7   r8   r%   S  s   







z_BatchNormBase.__init__c                 C   r9   )NzBatchNorm Base errorr:   r<   r7   r7   r8   r>     r?   z_BatchNormBase._check_input_dimc                 C   r9   )Nz BatchNorm Base data format errorr:   r<   r7   r7   r8   _check_data_format  r?   z!_BatchNormBase._check_data_formatc                 C   sT   |  | j | | | jrtd t|| j| j| j	| j
| j| j| j| j| jd
S )Nz<When training, we now always track global mean and variance.)r@   r,   trainingr0   r/   r3   r   )r   r   r>   r   warningswarnr   r   r   r@   r,   r   r&   r   r<   r7   r7   r8   rA     s$   
z_BatchNormBase.forwardc                 C   sL   d | j| j| j}| jdkr|d | j7 }| jd ur$|d | j7 }|S )Nz(num_features={}, momentum={}, epsilon={}r   , data_format={}	, name={})rC   r)   r   r&   r   r   r-   Zmain_strr7   r7   r8   rE     s   

z_BatchNormBase.extra_repr)r   r   NNr   NN)
rG   rH   rI   rJ   r%   r>   r   rA   rE   rK   r7   r7   r5   r8   r   N  s    gr   c                       @   e Zd ZdZ							d fdd	Zdd	 Zd
d Z  ZS )BatchNorm1Da  
    Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .

    When use_global_stats = False, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
    Calculated as follows:

    ..  math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\
        \ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \
        \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\

    When use_global_stats = True, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch.
    They are global or running statistics (moving_mean and moving_variance). It usually got from the
    pre-trained model. Calculated as follows:

    .. math::
        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\

    The normalization function formula is as follows:

    ..  math::

        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

    - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero
    - :math:`\gamma` : trainable proportional parameter
    - :math:`\beta` : trainable deviation parameter

    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable.
            If the Initializer of the weight_attr is not set, the parameter is initialized with ones. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
            If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable.
            If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
        data_format(str, optional): Specify the input data format, may be "NC", "NCL" or "NLC". Default "NCL".
        use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None.
        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..

    Shape:
        - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length) when data_format is "NC" or "NCL",
            (batch, length, num_features) when data_format is "NLC".
        - output: 3-D tensor with same shape as input x.

    Returns:
        None.
    

    Examples:
        .. code-block:: python

          import paddle

          x = paddle.rand((2, 1, 3))
          batch_norm = paddle.nn.BatchNorm1D(1)
          batch_norm_out = batch_norm(x)

          print(batch_norm_out)
    r   r   NNCLc	           	   
      "   t t| |||||||| d S N)r$   r   r%   	r-   r.   r0   r/   r1   r2   r3   r   r4   r5   r7   r8   r%   +     
zBatchNorm1D.__init__c                 C   sD   |dks|dks|dkrd| _ d S |dks|dkrd| _ d S td)Nr   NCr   NHWCNLCz4expected NC , NCL, NLC or None for data_format inputr   rR   r<   r7   r7   r8   r   A  s   

zBatchNorm1D._check_data_formatc                 C   rN   rO   rP   r<   r7   r7   r8   r>   K  rS   zBatchNorm1D._check_input_dim)r   r   NNr   NNrG   rH   rI   rJ   r%   r   r>   rK   r7   r7   r5   r8   r     s    J
r   c                   @   s    e Zd ZdZdd Zdd ZdS )BatchNorm2Da  
    Applies Batch Normalization over a 4D input (a mini-batch of 2D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .

    When use_global_stats = False, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
    Calculated as follows:

    ..  math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//
        \ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - 
        \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\

    When use_global_stats = True, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch.
    They are global or running statistics (moving_mean and moving_variance). It usually got from the
    pre-trained model. Calculated as follows:

    .. math::
        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\

    The normalization function formula is as follows:

    ..  math::

        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

    - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero
    - :math:`\gamma` : trainable proportional parameter
    - :math:`\beta` : trainable deviation parameter

    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable.
            If the Initializer of the weight_attr is not set, the parameter is initialized with ones. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
            If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable.
            If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
        data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
        use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None.
        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..

    Shape:
        - x: 4-D tensor with shape: (batch, num_features, height, weight) when data_format is "NCHW",
            or (batch, height, weight, num_features) when data_format is "NHWC".
        - output: 4-D tensor with same shape as input x.

    Returns:
        None

    Examples:
        .. code-block:: python

          import paddle

          x = paddle.rand((2, 1, 2, 3))
          batch_norm = paddle.nn.BatchNorm2D(1)
          batch_norm_out = batch_norm(x)

          print(batch_norm_out)
    c                 C   s,   |dkr	|| _ d S |dkr|| _ d S td)Nr   r   z+expected NCHW or NHWC for data_format inputr   r<   r7   r7   r8   r     s
   

zBatchNorm2D._check_data_formatc                 C   rV   rW   rP   r<   r7   r7   r8   r>     rY   zBatchNorm2D._check_input_dimN)rG   rH   rI   rJ   r   r>   r7   r7   r7   r8   r   T  s    Fr   c                       r   )BatchNorm3Da  
    Applies Batch Normalization over a 5D input (a mini-batch of 3D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .

    When use_global_stats = False, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
    Calculated as follows:

    ..  math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\
        \ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \
        \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\

    When use_global_stats = True, the :math:`\\mu_{\\beta}`
    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
    They are global or running statistics (moving_mean and moving_variance). It usually got from the
    pre-trained model. Calculated as follows:

    .. math::
        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\

    The normalization function formula is as follows:

    ..  math::

        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

    - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero
    - :math:`\gamma` : trainable proportional parameter
    - :math:`\beta` : trainable deviation parameter

    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable.
            If the Initializer of the weight_attr is not set, the parameter is initialized with ones. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
            If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable.
            If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
        data_format(str, optional): Specify the input data format, the data format can be "NCDHW" or "NDHWC. Default: NCDHW.
        use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None.
        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..

    Shape:
        - x: 5-D tensor with shape: (batch, num_features, dims, height, weight) when data_format is "NCDHW",
            or (batch, dims, height, weight, num_features) when data_format is "NDHWC".
        - output: 5-D tensor with same shape as input x.

    Returns:
        None

    Examples:
        .. code-block:: python

          import paddle

          x = paddle.rand((2, 1, 2, 2, 3))
          batch_norm = paddle.nn.BatchNorm3D(1)
          batch_norm_out = batch_norm(x)

          print(batch_norm_out)
    r   r   NNCDHWc	           	   
      r   r   )r$   r   r%   r   r5   r7   r8   r%     r   zBatchNorm3D.__init__c                 C   s<   |dks|dkrd| _ d S |dks|dkrd| _ d S td)Nr   r   r   NDHWCz3expected NCDHW, NDHWC or None for data_format inputr   r<   r7   r7   r8   r     s   

zBatchNorm3D._check_data_formatc                 C   rV   r\   rP   r<   r7   r7   r8   r>     rY   zBatchNorm3D._check_input_dim)r   r   NNr   NNr   r7   r7   r5   r8   r     s    I
r   c                       sJ   e Zd ZdZ						d fdd	Zdd	 Zd
d Zedd Z  Z	S )SyncBatchNorma  
    This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
    It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can 
    be used as a normalizer function for other operations, such as conv2d and fully connected 
    operations.
    The data is normalized by the mean and variance of the channel based on whole mini-batch
    , which including data in all gpus.
    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
    for more details.

    When model in training mode, the :math:`\\mu_{\\beta}` 
    and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus.
    Calculated as follows:

    ..  math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\
        \ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \
        \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\

    - :math:`x` : whole mini-batch data in all gpus
    - :math:`m` : the size of the whole mini-batch data

    When model in evaluation mode, the :math:`\\mu_{\\beta}`
    and :math:`\sigma_{\beta}^{2}` are global statistics (moving_mean and moving_variance, 
    which usually got from the pre-trained model). Global statistics calculated as follows:

    .. math::
        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\

    The formula of normalization is as follows:
 
    ..  math::

        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
        \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

    - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero
    - :math:`\gamma` : trainable scale parameter vector
    - :math:`\beta` : trainable shift parameter vector 

    Note:
        If you want to use container to pack your model and has ``SyncBatchNorm`` in the 
        evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of 
        ``list`` to pack the model. 

    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
             of this layer. If it is set to None or one attribute of ParamAttr, this layerr
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with ones. If it is set to False, 
             this layer will not have trainable scale parameter. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer.
             If it is set to None or one attribute of ParamAttr, this layer
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. If it is set to False, this layer will not 
             have trainable bias parameter. Default: None.

    Shapes:
        input: Tensor that the dimension from 2 to 5.
        output: Tensor with the same shape as input.

    Examples:
        .. code-block:: python

          # required: gpu

          import paddle
          import paddle.nn as nn

          x = paddle.to_tensor([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')

          if paddle.is_compiled_with_cuda():
              sync_batch_norm = nn.SyncBatchNorm(2)
              hidden1 = sync_batch_norm(x)
              print(hidden1)
              # Tensor(shape=[1, 2, 2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False,
              #        [[[[ 0.26824948,  1.09363246],
              #           [ 0.26824948, -1.63013160]],

              #          [[ 0.80956620, -0.66528702],
              #           [-1.27446556,  1.13018656]]]])
    r   r   Nr   c              
      s"   t t| ||||||d | d S r   )r$   r   r%   )r-   r.   r0   r/   r1   r2   r3   r4   r5   r7   r8   r%   t  s   

zSyncBatchNorm.__init__c                 C   s0   | j dv r
d| _ d S | j dv rd| _ d S td)N)r   r   r   r   r   )r   r   r   r   zMexpected 'NCDHW', 'NDHWC', 'NCL', 'NLC', 'NC', 'NCHW', 'NHWC' for data_formatr   rD   r7   r7   r8   r     s   



z SyncBatchNorm._check_data_formatc                 C   s~  |    | j}| j}t r.t|| j| j| j| j| j| j	| j
| j ddd\}}}}}}|S t rcd| jd| j	d| j d| j
ddddddd	df}tj|| j| j| j| j||g|R  \}}}}}}|S t|d
g dd | j| j	| j | j
ddddd}|g| jg| jg| jg| jgd}| jj| jdd}| jj| jdd}	| j| j}|g|g|g|g|	gd}
| jjd||
|d |S )NFr0   r/   is_testdata_layout
use_mkldnnfuse_with_relur   trainable_statisticsr=   )r   r   Zfloat64r   )r0   r/   r   r   r   r   r   r   )ri   rk   rj   rn   ro   Trf   )rm   ZMeanOutZVarianceOutZ	SavedMeanZSavedVariancesync_batch_normrp   )r   r   r   r   r   Zsync_batch_norm_r@   r,   r   r&   r   r   r   r   r   r   ru   rv   r   rw   )r-   xrx   ry   Zsync_batch_norm_outrz   rt   rr   Z
saved_meanZsaved_variancers   r7   r7   r8   rA     s   
zSyncBatchNorm.forwardc                 C   s  |}t |trw|jdkr t |jts |jjdkr |jjd |j_|jdkr9t |jts9|jjdkr9|jjd |j_t|j|j|j	|j|j|j
|j}|jdkro|jdkrot  |j|_|j|_W d   n1 sjw   Y  |j|_|j|_| D ]\}}||| | q{~|S )a  
        Helper function to convert :class: `paddle.nn.BatchNorm*d` layers in the model to :class: `paddle.nn.SyncBatchNorm` layers.

        Parameters:
            layer(paddle.nn.Layer): model containing one or more `BatchNorm*d` layers.

        Returns:
            The original model with converted SyncBatchNorm layers. If BatchNorm*d layer in the model, use SyncBatchNorm layer instead.

        Examples:

            .. code-block:: python
                import paddle
                import paddle.nn as nn

                model = nn.Sequential(nn.Conv2D(3, 5, 3), nn.BatchNorm2D(5))
                sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

        NZ_syncF)r}   r   r'   boolr4   r(   r   r)   r   r&   r   r   r   r@   r,   r   r   Znamed_childrenZadd_sublayerconvert_sync_batchnorm)clslayerZlayer_outputr4   Zsublayerr7   r7   r8   r     sD   







z$SyncBatchNorm.convert_sync_batchnorm)r   r   NNr   N)
rG   rH   rI   rJ   r%   r   rA   classmethodr   rK   r7   r7   r5   r8   r     s    ^
hr   c                       s<   e Zd ZdZ					d fdd	Zd	d
 Zdd Z  ZS )LocalResponseNorma  
    Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions.
    For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks <https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`_

    See more details in :ref:`api_paddle_nn_functional_local_response_norm` .

    Parameters:
        size (int): The number of channels to sum over.
        alpha (float, optional): The scaling parameter, positive. Default:1e-4
        beta (float, optional): The exponent, positive. Default:0.75
        k (float, optional): An offset, positive. Default: 1.0
        data_format (str, optional): Specify the data format of the input, and the data format of the output
            will be consistent with that of the input. An optional string from:
            If input is 3-D Tensor, the string could be `"NCL"` or `"NLC"` . When it is `"NCL"`,
            the data is stored in the order of: `[batch_size, input_channels, feature_length]`.
            If input is 4-D Tensor, the string could be  `"NCHW"`, `"NHWC"`. When it is `"NCHW"`,
            the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`.
            If input is 5-D Tensor, the string could be  `"NCDHW"`, `"NDHWC"` . When it is `"NCDHW"`,
            the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`.
        name (str, optional): Name for the operation (optional, default is None). For more information,
            please refer to :ref:`api_guide_Name`.

    Shape:
        - input: 3-D/4-D/5-D tensor.
        - output: 3-D/4-D/5-D tensor, the same shape as input.

    Examples:

    .. code-block:: python

        import paddle

        x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32")
        m = paddle.nn.LocalResponseNorm(size=5)
        y = m(x)
        print(y.shape)  # [3, 3, 112, 112]
    -C6?      ?r   r   Nc                    s6   t t|   || _|| _|| _|| _|| _|| _d S r   )	r$   r   r%   sizealphabetakr3   r4   )r-   r   r   r   r   r3   r4   r5   r7   r8   r%   _  s   	
zLocalResponseNorm.__init__c              	   C   s&   t || j| j| j| j| j| j}|S r   )FZlocal_response_normr   r   r   r   r3   r4   )r-   r=   outr7   r7   r8   rA   p  s   	zLocalResponseNorm.forwardc                 C   sP   d | j| j| j| j}| jdkr|d | j7 }| jd ur&|d | j7 }|S )Nz size={}, alpha={}, beta={}, k={}r   r   r   )rC   r   r   r   r   r3   r4   r   r7   r7   r8   rE   |  s   

zLocalResponseNorm.extra_repr)r   r   r   r   Nr{   r7   r7   r5   r8   r   8  s    )r   )0sixZfluid.dygraphr   r   Z	frameworkr   r   r   r   r	   r
   Zfluid.data_feederr   r   Zfluidr   r   r   r   r   numpyr   r~   r   r    r   Zpaddler   r   r   r   Zpaddle.fluid.frameworkr   r   __all__r   rM   rU   r[   r^   r|   r   r   r   r   r   r   r7   r7   r7   r8   <module>   sF   =FCC "k qVn  "