o
    Ne4U                    @   s  d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddlm	Z
 ddlmZ d	d
lmZ ddlmZmZmZmZmZmZmZmZmZmZ ddlmZmZmZmZ ddlmZ ddlmZm Z m!Z! ddlm"Z" d	dl#m$Z$ ddlmZmZ d dl%Z&d dl'Z'd dl(Z(d dl)Z)d dl*m  m+Z+ d dlm,Z,m-Z- g dZ.G dd dej/Z0G dd dej/Z1G dd dej/Z2G dd dej/Z3G dd dej/Z4G dd dej/Z5G d d! d!ej/Z6G d"d# d#ej/Z7G d$d% d%ej/Z8G d&d' d'ej/Z9G d(d) d)ej/Z:G d*d+ d+ej/Z;G d,d- d-ej/Z<G d.d/ d/ej/Z=G d0d1 d1ej/Z>G d2d3 d3ej/Z?G d4d5 d5ej/Z@G d6d7 d7ej/ZAG d8d9 d9ej/ZBG d:d; d;ej/ZCG d<d= d=ej/ZDdS )>    )print_functionN)reduce   )core)utils)nn)dygraph_utils   )layers)
Variable_non_static_modeOpProtoHolder	Parameter_dygraph_tracer_varbase_creatordefault_main_program_global_flagsin_dygraph_mode_in_legacy_dygraph)convert_dtypecheck_variable_and_dtype
check_typecheck_dtype)	ParamAttr)NormalConstantNumpyArrayInitializer)unique_name)LayerObjectHelper)r   r   )_C_ops_legacy_C_ops)Conv2DConv3DPool2DLinear	BatchNormDropout	EmbeddingGRUUnitInstanceNorm	LayerNormNCEPReluBilinearTensorProductConv2DTransposeConv3DTranspose	GroupNormSpectralNormTreeConvFlattenc                       <   e Zd ZdZ									d fdd	Zd	d
 Z  ZS )r!   a8  
    This interface is used to construct a callable object of the ``Conv2D`` class.
    For more details, refer to code examples.
    The convolution2D layer calculates the output based on the input, filter
    and strides, paddings, dilations, groups parameters. Input and
    Output are in NCHW format, where N is batch size, C is the number of
    the feature map, H is the height of the feature map, and W is the width of the feature map.
    Filter's shape is [MCHW] , where M is the number of output feature map,
    C is the number of input feature map, H is the height of the filter,
    and W is the width of the filter. If the groups is greater than 1,
    C will equal the number of input feature map divided by the groups.
    Please refer to UFLDL's `convolution
    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
    for more details.
    If bias attribution and activation type are provided, bias is added to the
    output of the convolution, and the corresponding activation function is
    applied to the final result.

    For each input :math:`X`, the equation is:

    .. math::

        Out = \\sigma (W \\ast X + b)

    Where:

    * :math:`X`: Input value, a ``Tensor`` with NCHW format.
    * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
    * :math:`\\ast`: Convolution operation.
    * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
    * :math:`\\sigma`: Activation function.
    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.

    Example:

        - Input:

          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`

          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`

        - Output:

          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`

        Where

        .. math::

            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1

    Parameters:
        num_channels(int): The number of channels in the input image.
        num_filters(int): The number of filter. It is as same as the output
            feature map.
        filter_size (int or tuple): The filter size. If filter_size is a tuple,
            it must contain two integers, (filter_size_H, filter_size_W).
            Otherwise, the filter will be a square.
        stride (int or tuple, optional): The stride size. If stride is a tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride. Default: 1.
        padding (int or tuple, optional): The padding size. If padding is a tuple, it must
            contain two integers, (padding_H, padding_W). Otherwise, the
            padding_H = padding_W = padding. Default: 0.
        dilation (int or tuple, optional): The dilation size. If dilation is a tuple, it must
            contain two integers, (dilation_H, dilation_W). Otherwise, the
            dilation_H = dilation_W = dilation. Default: 1.
        groups (int, optional): The groups number of the Conv2D Layer. According to grouped
            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
            the first half of the filters is only connected to the first half
            of the input channels, while the second half of the filters is only
            connected to the second half of the input channels. Default: 1.
        param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
            will create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
        bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv2d.
            If it is set to False, no bias will be added to the output units.
            If it is set to None or one attribute of ParamAttr, conv2d
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. Default: None.
        use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True.
        act (str, optional): Activation type, if it is set to None, activation is not appended.
            Default: None.
        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".

    Attribute:
        **weight** (Parameter): the learnable weights of filter of this layer.

        **bias** (Parameter or None): the learnable bias of this layer.

    Returns:
        None
    
    Raises:
        ValueError: if ``use_cudnn`` is not a bool value.

    Examples:
        .. code-block:: python

          from paddle.fluid.dygraph.base import to_variable
          import paddle.fluid as fluid
          from paddle.fluid.dygraph import Conv2D
          import numpy as np

          data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
          with fluid.dygraph.guard():
              conv2d = Conv2D(3, 2, 3)
              data = to_variable(data)
              conv = conv2d(data)

    r	   r   NTfloat32c                    s  |dusJ dt t  t rtjdd rd}
|_|_	t
|dd_t
|dd_t
|dd_|_t|
tsGtd|
_t d	 _ _|_|_|	_|_jj	krv|j d
krvjsvjsvd_nd_t rjj	krjjkrd_nd_|_j	d u rj}njj	 d
krtdjj	 }t
jdd j|g  } fdd}jj|j| d_jjjgjdd_d S )NF$param_attr should not be False here.ZFLAGS_conv2d_disable_cudnnr   stridepaddingdilation!use_cudnn should be True or FalseFLAGS_use_mkldnnr   depthwise_conv2dconv2d)num_channels must be divisible by groups.filter_sizec                     s.    d  d  j  } d|  d }td|dS )Nr   r	          @      ?        _num_channelsr   Zfilter_elem_numZstdr?   self GD:\Projects\ConvertPro\env\Lib\site-packages\paddle/fluid/dygraph/nn.py_get_default_param_initializer   s   z7Conv2D.__init__.<locals>._get_default_param_initializerattrshapedtypedefault_initializerTrL   rM   rN   is_bias) superr!   __init__r   Zis_compiled_with_cudapaddleZfluidZ	get_flagsrD   _groupsr   convert_to_list_stride_padding	_dilation_act
isinstancebool
ValueError
_use_cudnnr   _use_mkldnn_filter_size_num_filters_param_attr
_bias_attr_dtype_l_typeZis_compiled_with_npucreate_parameterweightbiasrG   num_channelsnum_filtersr?   r7   r8   r9   groups
param_attr	bias_attr	use_cudnnactrN   Znum_filter_channelsfilter_shaperJ   	__class__rF   rI   rS      sp   

zConv2D.__init__c                 C   s  t  r<| jdkr<t|| j| j| jd| jr| jnd| jdddd}| j	d ur0t
j|| j	dd}n|}tj|| j| jdS t r| jdksI| jd	krd
| jd| jd| jd| jrY| jndd| jd| jf}tj|| jg|R  }|}tj|| j	d| jd}tj|| j| jdS |g| jgd}| j| j| j| jr| jnd| j| jd}t|dg dd | jj| jd}| jj| j|| jdd|i|d | j	d ur| jj| jd}| jjd|g| j	gdd|gid| jdd n|}| jj|| jdS )Nr=   EXPLICITr	   NCHWF)axis
use_mkldnnr<   stridespaddings	dilationsrl   ro   ry   InputFilterrz   r{   r|   rl   ro   ry   inputfloat16r5   float64r!   rN   Outputtypeinputsoutputsattrselementwise_addXYOutrw   ry   rp   )r   re   r   r=   rg   rW   rX   rU   rY   rh   Fr   r   _append_activation_in_dygraphrZ   r_   r   r^   r    _append_bias_in_dygraphr   _helper"create_variable_for_type_inferencerd   	append_opappend_activation)rG   r   pre_biaspre_actr   outr   rH   rH   rI   forward   s   




	

zConv2D.forward	r	   r   r	   NNNTNr5   __name__
__module____qualname____doc__rS   r   __classcell__rH   rH   rr   rI   r!   .   s    xNr!   c                       r4   )r"   aH  
    **Convlution3D Layer**

    The convolution3D layer calculates the output based on the input, filter
    and strides, paddings, dilations, groups parameters. Input(Input) and
    Output(Output) are multidimensional tensors with a shape of 
    :math:`[N, C, D, H, W]` . Where N is batch size, C is the number of
    channels, D is the depth of the feature, H is the height of the feature,
    and W is the width of the feature. Convlution3D is similar with Convlution2D
    but adds one dimension(depth). If bias attribution and activation type are
    provided, bias is added to the output of the convolution, and the
    corresponding activation function is applied to the final result.

    For each input :math:`X`, the equation is:

    .. math::

        Out = \sigma (W \\ast X + b)

    In the above equation:

    * :math:`X`: Input value, a tensor with NCDHW or NDHWC format.
    * :math:`W`: Filter value, a tensor with MCDHW format.
    * :math:`\\ast`: Convolution operation.
    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
    * :math:`\\sigma`: Activation function.
    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.

    Example:

        - Input:

          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`

          Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`

        - Output:
          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`

        Where

        .. math::

            D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
            H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
            W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1

    Parameters:
        num_channels(int): The number of channels in the input image.
        num_filters(int): The number of filter. It is as same as the output image channel.
        filter_size (int|tuple, optional): The filter size. If filter_size is a tuple,
            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
            Otherwise, the filter will be a square, filter_size_depth = filter_size_height
            = filter_size_width = filter_size.
        stride (int|tuple, optional): The stride size. If stride is a tuple, it must
            contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
            stride_D = stride_H = stride_W = stride. The default value is 1.
        padding (int|tuple, optional): The padding size. If padding is a tuple, it must
            contain three integers, (padding_D, padding_H, padding_W). Otherwise, the
            padding_D = padding_H = padding_W = padding. The default value is 0.
        dilation (int|tuple, optional): The dilation size. If dilation is a tuple, it must
            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
            dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
        groups (int, optional): The groups number of the Conv3D Layer. According to grouped
            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
            the first half of the filters is only connected to the first half
            of the input channels, while the second half of the filters is only
            connected to the second half of the input channels. The default value is 1.
        param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
            of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
            will create ParamAttr as param_attr. If it is set to None, the parameter
            is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
            :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None.
        bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d.
            If it is set to False, no bias will be added to the output units.
            If it is set to None or one attribute of ParamAttr, conv3d
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. The default value is None.
        use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. The default value is True.
        act (str, optional): Activation type, if it is set to None, activation is not appended.
            The default value is None.
        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".

    Attribute:
        **weight** (Parameter): the learnable weights of filters of this layer.

        **bias** (Parameter): the learnable bias of this layer.

    Returns:
        None.

    Raises:
        ValueError: If the shapes of input, filter_size, stride, padding and
                    groups mismatch.

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          import numpy

          with fluid.dygraph.guard():
              data = numpy.random.random((5, 3, 12, 32, 32)).astype('float32')
              conv3d = fluid.dygraph.nn.Conv3D(
                    num_channels=3, num_filters=2, filter_size=3, act="relu")
              ret = conv3d(fluid.dygraph.base.to_variable(data))

    r	   r   NTr5   c                    s   |dusJ dt t  |_|_t|dd_t|dd_t|dd_	|_
|
_ _|_|_|	_|_jd u rKj}njj dkrWtdjj }tjdd	 j|g  } fd
d}jj|j| d_jjjgjdd_d S )NFr6      r7   r8   r9   r   r>   r?   c                     s6    d  d   d  j  } d|  d }td|dS )Nr   r	   r   r@   rA   rB   rC   rE   rF   rH   rI   rJ     s   z7Conv3D.__init__.<locals>._get_default_param_initializerrK   TrP   )rR   r"   rS   rD   rU   r   rV   rW   rX   rY   rZ   r^   r`   ra   rb   rc   rd   r]   rf   rg   rh   ri   rr   rF   rI   rS     sB   
zConv3D.__init__c                 C   s   | j j| jd}| j jd|| jdd|i| j| j| j| jr | jnd| j	ddd | j
d urL| j j| jd}| j jd	|g| j
gd
d|giddid n|}| j j|| jdS )Nr   Zconv3dr}   r   r	   Fr   r   r   r   r   rw   r   )r   r   rd   r   rg   rW   rX   rY   rU   r^   rh   r   rZ   rG   r   r   r   rH   rH   rI   r     s:   
zConv3D.forwardr   r   rH   rH   rr   rI   r"   9  s    r7r"   c                       s<   e Zd ZdZ									d fdd	Zd	d
 Z  ZS )r/   a  
    **Convlution3D transpose layer**

    The convolution3D transpose layer calculates the output based on the input,
    filter, and dilations, strides, paddings. Input(Input) and output(Output)
    are in NCDHW format. Where N is batch size, C is the number of channels,
    D is the depth of the feature, H is the height of the feature, and W
    is the width of the feature. Parameters(dilations, strides, paddings) are
    two elements. These two elements represent height and width, respectively.
    The details of convolution transpose layer, please refer to the following
    explanation and references `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
    If bias attribution and activation type are provided, bias is added to
    the output of the convolution, and the corresponding activation function
    is applied to the final result.

    For each input :math:`X`, the equation is:

    .. math::

        Out = \sigma (W \\ast X + b)

    In the above equation:

    * :math:`X`: Input value, a tensor with NCDHW format.
    * :math:`W`: Filter value, a tensor with MCDHW format.
    * :math:`\\ast`: Convolution operation.
    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
    * :math:`\\sigma`: Activation function.
    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.

    Example:

        - Input:

          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`

          Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)`

        - Output:

          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`

        Where

        .. math::

           D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
           H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
           W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 \\\\
           D_{out} &\in [ D^\prime_{out}, D^\prime_{out} + strides[0] ] \\\\
           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[1] ] \\\\

    **Note**:

          The conv3d_transpose can be seen as the backward of the conv3d. For conv3d, 
          when stride > 1, conv3d maps multiple input shape to the same output shape, 
          so for conv3d_transpose, when stride > 1, input shape maps multiple output shape.
          If output_size is None, :math:`H_{out} = H^\prime_{out}, :math:`H_{out} = \
          H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output 
          size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`, 
          the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` 
          and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must 
          between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, 
          conv3d_transpose can compute the kernel size automatically.


    Parameters:
        num_channels(int): The number of channels in the input image.
        num_filters(int): The number of the filter. It is as same as the output
            image channel.
        filter_size(int|tuple): The filter size. If filter_size is a tuple,
            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
            Otherwise, the filter will be a square.
        padding(int|tuple, optional): The padding size. The padding argument effectively
             adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string,
             either 'VALID' or 'SAME' supported, which is the padding algorithm. If `padding`
             is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or
            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
            and when `data_format` is `'NCDHW'`, `padding` can be in the form
            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
            when `data_format` is `'NDHWC'`, `padding` can be in the form
            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
            The default value is 0.
        stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. 
            If stride is a tuple, it must contain three integers, (stride_depth, stride_height, 
            stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. 
            The default value is 1.
        dilation(int|tuple, optional): The dilation size. If dilation is a tuple, it must
            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
            dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
        groups(int, optional): The groups number of the Conv3D transpose layer. Inspired by
            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
            when group=2, the first half of the filters is only connected to the
            first half of the input channels, while the second half of the
            filters is only connected to the second half of the input channels.
            The default value is 1.
        param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
            of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose
            will create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. The default value is None.
        bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d_transpose.
            If it is set to False, no bias will be added to the output units.
            If it is set to None or one attribute of ParamAttr, conv3d_transpose
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. The default value is None.
        use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. The default value is True.
        act (str, optional): Activation type, if it is set to None, activation is not appended.
            The default value is None.
        name(str, optional): The default value is None. Normally there is no need for user 
            to set this property. For more information, please refer to :ref:`api_guide_Name`.

    Attribute:
        **weight** (Parameter): the learnable weights of filters of this layer.

        **bias** (Parameter): the learnable bias of this layer.

    Returns:
        None.

    Raises:
        ValueError: If the shapes of input, filter_size, stride, padding and
                    groups mismatch.

    Examples:
       .. code-block:: python

         import paddle.fluid as fluid
         import numpy

         with fluid.dygraph.guard():
             data = numpy.random.random((5, 3, 12, 32, 32)).astype('float32')
             conv3dTranspose = fluid.dygraph.nn.Conv3DTranspose(
                    num_channels=3,
                    num_filters=12,
                    filter_size=12,
                    use_cudnn=False)
             ret = conv3dTranspose(fluid.dygraph.base.to_variable(data))

    r   r	   NTr5   c                    s  t t|   t|
tstd|dusJ dt|dd| _t|dd| _	t|dd| _
|| _|| _|| _|d u r?dn|| _|| _|
| _|	| _|| _|| _t| jdd	| _| j| j| j g| j }| j| j|| jd
| _| j| j| jg| jdd| _d S )Nr:   Fz3param_attr should not be False in conv3d_transpose.r   r8   r7   r9   r	   zconv3d_transpose.filter_sizerN   rM   rL   TrP   )rR   r/   rS   r[   r\   r]   r   rV   rX   rW   rY   rb   rD   r`   rU   ra   r^   rc   rZ   rd   rf   rg   rh   )rG   rj   rk   r?   r8   r7   r9   rl   rm   rn   ro   rp   rN   rq   rr   rH   rI   rS     s>   
zConv3DTranspose.__init__c              
   C   s   | j j| jd}| j jd|g| jgdd|i| j| j| j| jr"| jnd| j	dd | j
rK| j j| jd}| j jd|g| jgd	d
|giddid n|}| j j|| jdS )Nr   Zconv3d_transposer}   r   r	   )rz   r{   r|   rl   ro   r   r   r   r   rw   r   )r   r   rd   r   rg   rW   rX   rY   rU   r^   rc   rh   r   rZ   r   rH   rH   rI   r     s8   zConv3DTranspose.forward)	r   r	   r	   NNNTNr5   r   rH   rH   rr   rI   r/     s     +r/   c                       s<   e Zd ZdZ									d fd	d
	Zdd Z  ZS )r#   a6  

    This interface is used to construct a callable object of the ``Pool2D`` class.
    For more details, refer to code examples.
    The pooling2d operation calculates the output based on the input, pool_type and pool_size, pool_stride,
    pool_padding parameters.Input and output are in NCHW format, where N is batch size, C is the number of feature map,
    H is the height of the feature map, and W is the width of the feature map.
    Parameters(ksize, strides, paddings) are two elements. These two elements represent height and width, respectively.
    The input(X) size and output(Out) size may be different.

    Example:

        - Input:

          Input shape: :math:`(N, C, H_{in}, W_{in})`

        - Output:

          Output shape: :math:`(N, C, H_{out}, W_{out})`

        If ``ceil_mode`` = False:

        .. math::

            H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\\\
            W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1

        If ``ceil_mode`` = True:

        .. math::

            H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 \\\\
            W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1

        If ``exclusive`` = False:

        .. math::

            hstart &= i * strides[0] - paddings[0] \\\\
            hend   &= hstart + ksize[0] \\\\
            wstart &= j * strides[1] - paddings[1] \\\\
            wend   &= wstart + ksize[1] \\\\
            Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}

        If ``exclusive`` = True:

        .. math::

            hstart &= max(0, i * strides[0] - paddings[0])\\\\
            hend &= min(H, hstart + ksize[0]) \\\\
            wstart &= max(0, j * strides[1] - paddings[1]) \\\\
            wend & = min(W, wstart + ksize[1]) \\\\
            Output(i ,j) & = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}

    Parameters:
        pool_size (int or list or tuple, optional): The pool kernel size. If pool kernel size is a tuple or list,
            it must contain two integers, (pool_size_Height, pool_size_Width).
            Otherwise, the pool kernel size will be a square of an int. Default: -1.
        pool_type(str, optional) : The pooling type, can be "max" for max-pooling and "avg" for average-pooling. 
            Default: max.
        pool_stride (int or list or tuple, optional): The pool stride size. If pool stride size is a tuple or list,
            it must contain two integers, (pool_stride_Height, pool_stride_Width). Otherwise,
            the pool stride size will be a square of an int. Default: 1.
        pool_padding (int or list or tuple, optional): The padding size for pooling operation. 
            If ``pool_padding`` is a tuple,
            it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width).
            Otherwise, the padding size for pooling operation will be a square of an int. Default: 0.
        global_pooling (bool, optional): Whether to use the global pooling. If global_pooling = true,
            kernel size and paddings will be ignored. Default: False.
        use_cudnn (bool, optional): Only used in cudnn kernel, need install cudnn. Default: True.
        ceil_mode (bool, optional): Whether to use the ceil function to calculate output height and width.
            False is the default. If it is set to False, the floor function will be used. Default: False.
        exclusive (bool, optional): Whether to exclude padding points in average pooling mode. Default: True.
        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
            The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
            ``[batch_size, input_channels, input_height, input_width]``. When it is `"NHWC"`, the data is 
            stored in the order of: ``[batch_size, input_height, input_width, input_channels]``

    Returns:
        None

    Raises:
        ValueError: If ``pool_type`` is not "max" nor "avg".
        ValueError: If ``global_pooling`` is False and ``pool_size`` is -1.
        ValueError: If ``use_cudnn`` is not a bool value.
        ValueError: If ``data_format`` is not "NCHW" nor "NHWC".

    Examples:

        .. code-block:: python

          import paddle.fluid as fluid
          from paddle.fluid.dygraph.base import to_variable
          import numpy as np

          with fluid.dygraph.guard():
             data = numpy.random.random((3, 32, 32, 5)).astype('float32')
             pool2d = fluid.dygraph.Pool2D(pool_size=2,
                            pool_type='max',
                            pool_stride=1,
                            global_pooling=False)
             pool2d_res = pool2d(to_variable(data))

    rv   maxr	   r   FTru   c
           
         s   |	  }	| }|dvrtdt||du r#|dkr#tdt| t|ts,tdt d | _|	dvr>td	t|	 tt	| 
  || _t|d
d| _t|d
d| _t|d
d| _|| _|| _|| _|| _|	| _d| _d S )N)r   Zavgz7Unknown pool_type: '%s'. It can only be 'max' or 'avg'.Frv   zeWhen the global_pooling is False, pool_size must be passed and be a valid value. Received pool_size: r:   r;   )ru   ZNHWCzMAttr(data_format) should be 'NCHW' or 'NHWC'. Received Attr(data_format): %s.r   	pool_sizepool_paddingpool_stridepool2d)upperlowerr]   strr[   r\   r   r_   rR   r#   rS   
_pool_typer   rV   
_pool_size_pool_padding_pool_stride_global_poolingr^   
_ceil_mode
_exclusive_data_formatre   )
rG   r   Z	pool_typer   r   global_poolingro   	ceil_mode	exclusivedata_formatrr   rH   rI   rS   I  sF   


zPool2D.__init__c                 C   s  t  rK| js"t r"t|| j| j| j| j| j	| j
| j| jdd| jS d| jd| jd| jd| jd| jd| jd	| jd
| jd| j	d| j
f}tj|g|R  S t|dg dd | j| j| j| j| j| j| j| j| j	| j
d
}d|gi}| j| j}| jj| jd|id|i|d |S )NFrt   pooling_typeksizer   rz   r{   ro   r   ry   r   r   r   )int8uint8r   r5   r   r#   )
r   r   r   rz   r{   ro   r   ry   r   r   r   r   r   )r   r_   r   r   r   r   r   r   r   r   r   r   r   r^   r    r   r   r   rd   r   re   )rG   r   r   r   Zpool_outrH   rH   rI   r   w  sN   




zPool2D.forward)	rv   r   r	   r   FTFTru   r   rH   rH   rr   rI   r#     s    j.r#   c                       2   e Zd ZdZ				d fdd	Zdd Z  ZS )	r$   a  
    
    Fully-connected linear transformation layer:

    .. math::

        Out = Act({XW + b})

    where :math:`X` is the input Tensor, :math:`W` and :math:`b` are weight and bias respectively.

    Linear layer takes only one ``Tensor`` input.
    The Linear layer multiplies input tensor with weight matrix and
    produces an output Tensor of shape [N, *, `output_dim`],
    where N is batch size and `*` means any number of additional dimensions.
    If ``bias_attr`` is not None, a bias variable will be created and added to the output.
    Finally, if ``act`` is not None, it will be applied to the output as well.

    Parameters:
        input_dim(int): The number of input units in this layer.
        output_dim(int): The number of output units in this layer.
        param_attr(ParamAttr or list of ParamAttr, optional): The parameter attribute for learnable
            weights(Parameter) of this layer. Default: None.
        bias_attr(ParamAttr or list of ParamAttr, optional): The attribute for the bias
            of this layer. If it is set to False, no bias will be added to the output units.
            If it is set to None, the bias is initialized zero. Default: None.
        act(str, optional): Activation to be applied to the output of this layer. Default: None.
        dtype(str, optional): Dtype used for weight, it can be "float32" or "float64". Default: "float32".

    Attributes:
        **weight** (Parameter): the learnable weights of this layer.

        **bias** (Parameter or None): the learnable bias of this layer.

    Returns:
        None

    Examples:
        .. code-block:: python

          from paddle.fluid.dygraph.base import to_variable
          import paddle.fluid as fluid
          from paddle.fluid.dygraph import Linear
          import numpy as np

          data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
          with fluid.dygraph.guard():
              linear = Linear(32, 64)
              data = to_variable(data)
              res = linear(data)  # [30, 10, 64]
    Nr5   c                    sX   t t|   || _|| _| j||g||dd| _| j|g||dd| _t d | _	d S )NF)rM   rL   rN   rQ   Tr;   )
rR   r$   rS   rZ   rd   rf   rg   rh   r   r_   )rG   Z	input_dim
output_dimrm   rn   rp   rN   rr   rH   rI   rS     s   
zLinear.__init__c                 C   s*  t  r4t|jd}t|| j|ddddddd| j tj|| j	t
|jd | jd}tj|| j| jd	S t|d
g dd ddd| jd}|g| jgd}| j| j}| jjd|d|i|d | j	d ur| jj| jd}| jjd|g| j	gdd|git
|jd | jdd n|}| jj|| jdS )Nr   transpose_XFtranspose_Yalphar	   ry   r   rx   r   r   r$   )r   r   r   ry   r   matmulr   r   r   r   )r   r   rN   r    r   rg   r_   r   r   rh   lenrM   r   rZ   r   r   r   rd   r   r   )rG   r   r   r   r   r   tmppre_activationrH   rH   rI   r     sZ   


zLinear.forwardNNNr5   r   rH   rH   rr   rI   r$     s    6r$   c                       s2   e Zd ZdZ				d	 fdd	Zdd Z  ZS )
r)   a  
    This interface is used to construct a callable object of the ``InstanceNorm`` class.
    For more details, refer to code examples.

    Can be used as a normalizer function for convolution or fully_connected operations.
    The required data format for this layer is one of the following:

    DataLayout: NCHW `[batch, in_channels, in_height, in_width]`

    Refer to `Instance Normalization: The Missing Ingredient for Fast Stylization <https://arxiv.org/pdf/1607.08022.pdf>`_
    for more details.

    :math:`input` is the input features over a mini-batch.

    ..  math::
        
        \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\
        \\ mean\ of\ one\  feature\ map\ in\ mini-batch \\\\
        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\
        \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\
        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift

    Note:
        `H` means height of feature map, `W` means width of feature map.

    Parameters:
        num_channels(int): Indicate the number of channels of the input ``Tensor``.
        epsilon(float, optional): A value added to the denominator for
            numerical stability. Default is 1e-5.
        param_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
             of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
	     will create ParamAttr as param_attr, the name of scale can be set in ParamAttr.
	     If the Initializer of the param_attr is not set, the parameter is initialized 
	     one. If it is set to False, will not create param_attr. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm.
             If it is set to None or one attribute of ParamAttr, instance_norm
	     will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
	     If the Initializer of the bias_attr is not set, the bias is initialized zero. 
             If it is set to False, will not create bias_attr. Default: None.
        dtype(str, optional): Indicate the data type of the input ``Tensor``,
             which can be float32 or float64. Default: float32.

    Returns:
        None.

    Examples:

        .. code-block:: python

          import paddle.fluid as fluid
          from paddle.fluid.dygraph.base import to_variable
          import numpy as np
          import paddle

          # x's shape is [1, 3, 1, 2] 
          x = np.array([[[[1.0, 8.0]], [[10.0, 5.0]], [[4.0, 6.0]]]]).astype('float32')
          with fluid.dygraph.guard():
              x = to_variable(x)
              instanceNorm = paddle.nn.InstanceNorm(3)
              ret = instanceNorm(x)
              # ret's shape is [1, 3, 1, 2]; value is [-1 1 0.999999 -0.999999 -0.999995 0.999995] 
              print(ret)

    h㈵>Nr5   c                    s   t t|   |dks|dkr||ksJ d|| _|| _|| _|| _|dkrM|dkrM| j| j|g| jtddd| _	| j| j|g| jtddd| _
d S d | _	d | _
d S )NFzNparam_attr and bias_attr must be set to Fasle at the same time in InstanceNorm      ?)rL   rM   rN   rO   rQ   rB   T)rR   r)   rS   _epsilonrb   rc   rd   rf   r   scalerh   )rG   rj   epsilonrm   rn   rN   rr   rH   rI   rS   a  s.   
zInstanceNorm.__init__c           
      C   s   t  rt|| j| j| j}|S t r$t|| j| jd| j\}}}|S t|dddgd d| ji}| jrD| jrD|g| jg| jgd}nd|gi}| j	j
| jdd	}| j	j
| jdd	}| j	
| j}|g|g|gd
}	| j	jd||	|d |S )Nr   r   r5   r   r)   )r   ScaleBiasr   TrN   stop_gradient)r   	SavedMeanSavedVarianceinstance_normr   )r   r   r   r   rh   r   r   r    r   r   r   rd   r   )
rG   r   r   _r   r   
saved_meansaved_varianceZinstance_norm_outr   rH   rH   rI   r     sJ   



zInstanceNorm.forward)r   NNr5   r   rH   rH   rr   rI   r)     s    Er)   c                       sF   e Zd ZdZ														d fd	d
	Zdd Z  ZS )r%   aX  

    This interface is used to construct a callable object of the ``BatchNorm`` class.
    For more details, refer to code examples.
    It implements the function of the Batch Normalization Layer and can be used 
    as a normalizer function for conv2d and fully connected operations.
    The data is normalized by the mean and variance of the channel based on the current batch data.
    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
    for more details.

    When use_global_stats = False, the :math:`\mu_{\beta}` 
    and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
    Calculated as follows:

    ..  math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &
        //\ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad &
        //\ mini-batch\ variance \\

    - :math:`x` : mini-batch data
    - :math:`m` : the size of the mini-batch data

    When use_global_stats = True, the :math:`\\mu_{\\beta}`
    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
    They are global or running statistics (moving_mean and moving_variance). It usually got from the
    pre-trained model. Calculated as follows:

    .. math::
        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\

    The normalization function formula is as follows:
 
    ..  math::

        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
        \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift


    - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero
    - :math:`\gamma` : trainable proportional parameter
    - :math:`\beta` : trainable deviation parameter

    Parameters:
        num_channels(int): Indicate the number of channels of the input ``Tensor``.
        act(str, optional): Activation to be applied to the output of batch normalization. Default: None.
        is_test (bool, optional): A flag indicating whether it is in test phrase or not.
             This flag only has effect on static graph mode. For dygraph mode, please use ``eval()``.
             Default: False.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale`
             of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm.
             If it is set to None or one attribute of ParamAttr, batch_norm
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. Default: None.
        dtype(str, optional): Indicate the data type of the input ``Tensor``,
             which can be float32 or float64. Default: float32.
        data_layout(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
        in_place(bool, optional): Make the input and output of batch norm reuse memory. Default: False.
        moving_mean_name(str, optional): The name of moving_mean which store the global Mean. Default: None.
        moving_variance_name(str, optional): The name of the moving_variance which store the global Variance. Default: None.
        do_model_average_for_mean_and_var(bool, optional): Whether parameter mean and variance should do model
            average when model average is enabled. Default: True.
        use_global_stats(bool, optional): Whether to use global mean and
            variance. In inference or test mode, set use_global_stats to true
            or is_test to true, and the behavior is equivalent.
            In train mode, when setting use_global_stats True, the global mean
            and variance are also used during train period. Default: False.
        trainable_statistics(bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when
            setting trainable_statistics True, mean and variance will be calculated by current batch statistics.
            Default: False.

    Returns:
        None

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          from paddle.fluid.dygraph.base import to_variable
          import numpy as np

          x = np.random.random(size=(3, 10, 3, 7)).astype('float32')
          with fluid.dygraph.guard():
              x = to_variable(x)
              batch_norm = fluid.BatchNorm(10)
              hidden1 = batch_norm(x)
    NF?r   r5   ru   Tc                    s@  t t|   || _|| _|| _t d | _|dusJ d|dkr&d| _n|| _|g}| j	| j|| jt
dd| _|oA| jjdk| j_| j	| j|| jd	d
| _|oW| jjdk| j_| j	t|t
dd|d|| jd| _d	| j_| j	t|t
dd|d|| jd| _d	| j_|
| _|	| _|| _|| _|| _d| _|| _|| _d S )Nr;   Fz,bias_attr should not be False in batch_norm.r   r5   r   rK   rB   TrP   )nameinitializerZ	trainableZdo_model_averagerL   rM   rN   )rR   r%   rS   rb   rc   rZ   r   r_   rd   rf   r   rg   Zlearning_rater   rh   r   _mean	_variance	_in_place_data_layout	_momentumr   _is_test_fuse_with_relu_use_global_stats_trainable_statistics)rG   rj   rp   is_testmomentumr   rm   rn   rN   data_layoutZin_placeZmoving_mean_nameZmoving_variance_nameZ!do_model_average_for_mean_and_varuse_global_statstrainable_statisticsparam_shaperr   rH   rI   rS     sd   
zBatchNorm.__init__c                 C   s  | j }| j}t ryt r7t|| j| j| j | j| j| j	| j
| j | j| jd\}}}}}}	tj|| j| jdS t rod| jd| j	d| j d| j
d| jd| jd	| jd
| jf}
tj|| j| j| j | jd ||g|
R  \}}	}	}	}	}	tj|| j| jdS t|dg dd | j| j	| j| j
d| j| j| jd}
|g| jg| jg| j g| jgd}| jj| jdd}| jj| jdd}| jj| j|dd}| jr|n| j| j}|g|g|g|g|gd}|d ur|g|d< | jjd|||
d | j|| jS )NF)rp   ry   r   r   r   r   ry   fuse_with_relur   r   r   r   r%   )r   r   r   r   ry   r   r   r   )r   r   r   MeanVarianceTr   )r   ZMeanOutZVarianceOutr   r   ZReserveSpace
batch_normr   )r   r   r   r   r   r   rg   rh   r   r   r   trainingr   r   r   r   rZ   r_   r   r   r    r   r   r   r   rd   input_dtyper   r   r   )rG   r   mean_outvariance_outZbatch_norm_outt1t2t3Zt4r   r   r   r   r   Zreserve_spacer   rH   rH   rI   r   R  s   





zBatchNorm.forward)NFr   r   NNr5   ru   FNNTFFr   rH   rH   rr   rI   r%     s$    cGr%   c                       2   e Zd ZdZ				d
 fdd	Zdd	 Z  ZS )r&   aA
  
   This interface is used to construct a callable object of the ``Dropout`` class.
   For more details, refer to code examples.

   Drop or keep each element of input independently. Dropout is a regularization
   technique for reducing overfitting by preventing neuron co-adaption during
   training. The dropout operator randomly sets (according to the given dropout
   probability) the outputs of some units to zero, while others are remain
   unchanged.

   Dropout layer can be removed for efficiency concern.

   Parameters:
       p (float, optional): Probability of setting units to zero. Default: 0.5
       seed (int, optional): A Python integer used to create random seeds. If this
                   parameter is set to None, a random seed is used.
                   NOTE: If an integer seed is given, always the same output
                   units will be dropped. DO NOT use a fixed seed in training. Default: None.
       dropout_implementation(string, optional): ['downgrade_in_infer'(default)|'upscale_in_train']

                                       1. downgrade_in_infer(default), downgrade the outcome at inference

                                          - train: out = input * mask
                                          - inference: out = input * (1.0 - p)

                                          (mask is a tensor same shape with input, value is 0 or 1
                                          ratio of 0 is dropout_prob)
                                       2. upscale_in_train, upscale the outcome at training time

                                          - train: out = input * mask / ( 1.0 - p )
                                          - inference: out = input

                                          (mask is a tensor same shape with input, value is 0 or 1
                                          ratio of 0 is p)
       is_test (bool, optional): A flag indicating whether it is in test phrase or not.
                   This flag only has effect on static graph mode. For dygraph mode, please use ``eval()``.
                   Default: False.

   Returns:
       None

   Examples:

       .. code-block:: python

           import paddle.fluid as fluid
           from paddle.fluid.dygraph.base import to_variable
           import numpy as np

           x = np.random.random(size=(3, 10, 3, 7)).astype('float32')
           with fluid.dygraph.guard():
               x = to_variable(x)
               m = fluid.dygraph.Dropout(p=0.5)
               droped_train = m(x)
               # switch to eval mode
               m.eval()
               droped_eval = m(x)
   rA   Ndowngrade_in_inferFc                    s   t t|   t|ttfsJ dd|  krdks$J d J d|| _|d u s4t|ts4J d|| _|dv s?J d|| _|| _	d S )Nzp argument should be a numberr   r	   z!p argument should between 0 and 1z)seed argument should be None or a integer)r   Zupscale_in_trainzTdropout_implementation argument should be 'downgrade_in_infer' or 'upscale_in_train')
rR   r&   rS   r[   floatint_dropout_prob_seed_dropout_implementationr   )rG   pseeddropout_implementationr   rr   rH   rI   rS     s   $


zDropout.__init__c                 C   s   | j dkr|S t }| jd u s| jdkr|jdkr|j| _| j t r&| j n| j| jd u| jd ur4| jnd| jd}t rQt|	 d}t
j|g|R  \}}|S | jj|jd}| jjtjjjdd}| jjdd|gi|g|gd	|d
 |S )Nr   )Zdropout_probr   Zfix_seedr   r   rH   r   Tr   dropoutr   )r   Maskr   )r   r   r   Zrandom_seedr   r   r   r   sumitemsr    r  r   r   rN   r   ZVarDescZVarTypeZUINT8r   )rG   r   progr   r   maskrH   rH   rI   r     s6   
	
zDropout.forward)rA   Nr   Fr   rH   rH   rr   rI   r&     s    <r&   c                       s4   e Zd ZdZ					d	 fdd	Zdd Z  ZS )
r'   a  
    :alias_main: paddle.nn.Embedding
	:alias: paddle.nn.Embedding,paddle.nn.layer.Embedding,paddle.nn.layer.common.Embedding
	:old_api: paddle.fluid.dygraph.Embedding

    **Embedding Layer**

    This interface is used to construct a callable object of the ``Embedding`` class.
    For specific usage, refer to code examples. It implements the function of the Embedding Layer.
    This layer is used to lookup embeddings vector of ids provided by :attr:`input` .
    It automatically constructs a 2D embedding matrix based on the
    input :attr:`size` (vocab_size, emb_size) and :attr:`dtype` .

    The shape of output Tensor is generated by appending an emb_size dimension to the
    last dimension of the input Tensor shape.

    **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < size[0]` ,
    otherwise the program will throw an exception and exit.

    .. code-block:: text

        Case 1:

        input is a Tensor. padding_idx = -1
            input.data = [[1, 3], [2, 4], [4, 127]
            input.shape = [3, 2]
        Given size = [128, 16]
        output is a Tensor:
            out.shape = [3, 2, 16]
            out.data = [[[0.129435295, 0.244512452, ..., 0.436322452],
                        [0.345421456, 0.524563927, ..., 0.144534654]],

                        [[0.345249859, 0.124939536, ..., 0.194353745],
                        [0.945345345, 0.435394634, ..., 0.435345365]],
                        
                        [[0.945345345, 0.435394634, ..., 0.435345365],
                        [0.0,         0.0,         ..., 0.0        ]]]  # padding data
        The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127
        It will pad all-zero data when ids is 127.

    Parameters:
        size(tuple|list): The shape of the look up table parameter. It should have two elements which indicate the size
            of the dictionary of embeddings and the size of each embedding vector respectively.
        is_sparse(bool): The flag indicating whether to use sparse update. This parameter only
            affects the performance of the backwards gradient update. It is recommended to set 
            True because sparse update is faster. But some optimizer does not support sparse update,
            such as :ref:`api_fluid_optimizer_AdadeltaOptimizer` , :ref:`api_fluid_optimizer_AdamaxOptimizer` , 
            :ref:`api_fluid_optimizer_DecayedAdagradOptimizer` , :ref:`api_fluid_optimizer_FtrlOptimizer` ,
            :ref:`api_fluid_optimizer_LambOptimizer` and :ref:`api_fluid_optimizer_LarsMomentumOptimizer` .
            In these case, is_sparse must be False. Default: False.
        is_distributed(bool): Whether to store the embedding matrix in a distributed manner. Only used
            in multi-machine distributed CPU training. Default: False.
        padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). 
            If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted
            to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup
            encounters :math:`padding\_idx` in id. And the padding data will not be updated while training.
            If set None, it makes no effect to output. Default: None.
        param_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the
            default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition,
            user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. 
            The local word vector needs to be transformed into numpy format, and the shape of local word
            vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer`
            is used to load custom or pre-trained word vectors. See code example 2 for details.
        dtype(np.dtype|core.VarDesc.VarType|str): It refers to the data type of output Tensor.
            It must be "float32" or "float64". Default: "float32".

    Attribute:
        **weight** (Parameter): the learnable weights of this layer.

    Returns:
        Variable: Embedding Tensor or LoDTensor mapped by input. The data type is the same as :attr:`dtype` .

    Examples:

        .. code-block:: python

          import paddle.fluid as fluid
          import paddle.fluid.dygraph.base as base
          import numpy as np

          # example 1
          inp_word = np.array([[2, 3, 5], [4, 2, 1]]).astype('int64')
          inp_word.shape  # [2, 3]
          dict_size = 20
          with fluid.dygraph.guard():
              emb = fluid.dygraph.Embedding(
                  size=[dict_size, 32],
                  param_attr='emb.w',
                  is_sparse=False)
              static_rlt3 = emb(base.to_variable(inp_word))
              static_rlt3.shape  # [2, 3, 32]

          # example 2: load custom or pre-trained word vectors
          weight_data = np.random.random(size=(128, 100))  # word vectors with numpy format
          w_param_attrs = fluid.ParamAttr(
              name="emb_weight",
              learning_rate=0.5,
              initializer=fluid.initializer.NumpyArrayInitializer(weight_data),
              trainable=True)
          with fluid.dygraph.guard():
              emb = fluid.dygraph.Embedding(
                  size=[128, 100],
                  param_attr= w_param_attrs,
                  is_sparse=False)
              static_rlt3 = emb(base.to_variable(inp_word))          
    FNr5   c                    s   t t|   || _|| _|| _|d u rdn|dkr|n|d | | _|| _|| _| jo/| j | _	| j	r@| jdu r>| jdu s@J | j
| j| j| jdd| _d S )Nrv   r   TFrP   )rR   r'   rS   _size
_is_sparse_is_distributed_padding_idxrb   rd   _remote_prefetchrf   rg   )rG   size	is_sparseis_distributedpadding_idxrm   rN   rr   rH   rI   rS     s"   
zEmbedding.__init__c                 C   s   t  rt| j|d| jd| jd| jd| j
S t|dg dd | j| j| j| jd}| j	
| j}| j	jd	|| jd
d|i|d |S )Nr  r  remote_prefetchr  r   )r   r   Zint16int32int64r'   )r  r  r  r  lookup_table_v2)ZIdsWr   r   )r   r    r  rg   r  r	  r  r
  r   r   r   rd   r   )rG   r   r   r   rH   rH   rI   r     s0   
zEmbedding.forward)FFNNr5   r   rH   rH   rr   rI   r'     s    mr'   c                       s8   e Zd ZdZ							d
 fdd	Zdd	 Z  ZS )r*   a  
    :alias_main: paddle.nn.LayerNorm
	:alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm
	:old_api: paddle.fluid.dygraph.LayerNorm

    This interface is used to construct a callable object of the ``LayerNorm`` class.
    For more details, refer to code examples.
    It implements the function of the Layer Normalization Layer and can be applied to mini-batch input data.
    Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_

    The formula is as follows:

    ..  math::

        \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} x_i

        \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}{(x_i - \\mu)^2} + \\epsilon}

        y & = f(\\frac{g}{\\sigma}(x - \\mu) + b)

    - :math:`x`: the vector representation of the summed inputs to the neurons in that layer.
    - :math:`H`: the number of hidden units in a layers
    - :math:`\\epsilon`: the small value added to the variance to prevent division by zero.
    - :math:`g`: the trainable scale parameter.
    - :math:`b`: the trainable bias parameter.

    Parameters:
        normalized_shape(int or list or tuple): Input shape from an expected input of
            size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`.
            If it is a single integer, this module will normalize over the last dimension
            which is expected to be of that specific size.
        scale(bool, optional): Whether to learn the adaptive gain :math:`g` after
            normalization. Default: True.
        shift(bool, optional): Whether to learn the adaptive bias :math:`b` after
            normalization. Default: True.
        epsilon(float, optional): The small value added to the variance to prevent
            division by zero. Default: 1e-05.
        param_attr(ParamAttr, optional): The parameter attribute for the learnable
            gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
            omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
            a default :code:`ParamAttr` would be added as scale. The
            :attr:`param_attr` is initialized as 1 if it is added. Default: None.
        bias_attr(ParamAttr, optional): The parameter attribute for the learnable
            bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
            omitted. If :attr:`shift` is True and :attr:`param_attr` is None,
            a default :code:`ParamAttr` would be added as bias. The
            :attr:`bias_attr` is initialized as 0 if it is added. Default: None.
        act(str, optional): Activation to be applied to the output of layer normalization.
                  Default: None.
        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".

    Returns:
        None

    Examples:

        .. code-block:: python

          import paddle.fluid as fluid
          from paddle.fluid.dygraph.base import to_variable
          import numpy

          x = numpy.random.random((3, 32, 32)).astype('float32')
          with fluid.dygraph.guard():
              x = to_variable(x)
              layerNorm = fluid.LayerNorm([32, 32])
              ret = layerNorm(x)

    Tr   Nr5   c	           
         s   t t|   t|tjr|g}t|| _|| _|| _	|| _
|| _|| _|| _|| _t| jg}	| jrC| j| j|	| jtdd| _n| jrKtd d | _| j	rf| jdusXJ | j| j|	| jdd| _d S | jrntd d | _d S )Nr   rK   z0param_attr are only available with scale is TrueFTrP   z/bias_attr are only available with shift is True)rR   r*   rS   r[   numbersIntegrallist_normalized_shape_scale_shiftr   rb   rc   rZ   rd   npprodrf   r   rg   loggingwarnrh   )
rG   Znormalized_shaper   shiftr   rm   rn   rp   rN   r   rr   rH   rI   rS     s@   	




zLayerNorm.__init__c              	   C   s  t |j}t|}t| j}|| | _||k s!|| jd  | jkr:t| j}td| d |dd   d t| t rst rYt	
|| j| j| j| jd\}}}tj|| jdS t
|| j| jd| jd| j\}}}tj|| jdS t|d	d
dgd t }|g|d< | jr| jg|d< | jr| jg|d< | j| jd}	| jj| jdd}
| jj| jdd}| j| j}| jjd|||
|d| j| jdd | jj|| jdS )NzGiven normalized_shape is z , expected input with shape [*, r	   z, but got input shape Fr   r   begin_norm_axisr   r5   r   r*   r   r   r   )r   r   Tr   
layer_normr   r   r   r   )r  rM   r   r  Z_begin_norm_axisr   r]   r   r   r   r!  rg   rh   r   r   r   rZ   r    r   dictr  r  r   r   rd   r   r   )rG   r   input_shapeZ
input_ndimZnormalized_ndimZstr_normalized_shaper   r   r   r   r   r   Zlayer_norm_outrH   rH   rI   r   *  s   









zLayerNorm.forward)TTr   NNNr5   r   rH   rH   rr   rI   r*     s    H,r*   c                       s6   e Zd ZdZ						d fdd	Zd	d
 Z  ZS )r(   at  
    **GRU unit layer**
    
    It creates a callable object from GRUUnit class.
    If origin_mode is True, then the equation of a gru step is from paper
    `Learning Phrase Representations using RNN Encoder-Decoder for Statistical 
    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_

        .. math::
            u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)

            r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)

            m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)

            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)

    If origin_mode is False, then the equation of a gru step is from paper
    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
    Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_

        .. math::
            u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)

            r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)

            m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)

            h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)


    The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms
    of the equation above, the :math:`z_t` is split into 3 parts -
    :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to
    implement a full GRU unit operator for an input, a fully
    connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`.

    The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
    of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, there is
    an intermediate candidate hidden output, which is denoted by :math:`m_t`.
    This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})`
    and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.

    Parameters:
        size (int): The input dimension value.
        param_attr(ParamAttr, optional): The parameter attribute for the learnable
            hidden-hidden weight matrix. 
            
            **Note**:
    
                1. The shape of the weight matrix is :math:`[T, 3*D]`, where D is the hidden size.
                2. All elements in the weight matrix can be divided into two parts. The first 
                   part are weights of the update gate and reset gate with shape :math:`[D, 2*D]`, 
                   and the second part are weights for candidate hidden state with shape :math:`[D, D]`.


            If it is set to None or one attribute of ParamAttr, gru_unit will
            create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. The default 
            value is None.
        bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias
            of GRU.Note that the bias with :math:`[1, 3*D]` concatenates
            the bias in the update gate, reset gate and candidate calculations.
            If it is set to False, no bias will be applied to the update gate,
            reset gate and candidate calculations. If it is set to None or one
            attribute of ParamAttr, gru_unit will create ParamAttr as
            bias_attr. If the Initializer of the bias_attr is not set, the bias
            is initialized zero. The default value is None.
        activation (str): The activation type for cell (actNode).
                             The default value is 'tanh'.
        gate_activation (str): The activation type for gates (actGate).
                                  The default value is 'sigmoid'.
        dtype(str): The dtype of the layers. The data type can be set as
            'float32', 'float64'. The default value is 'float32'.

    Attribute:
        **weight** (Parameter): the learnable weights of this layer.

        **bias** (Parameter): the learnable bias of this layer.

    Returns:
        tuple: The hidden value, reset-hidden value and gate values. The hidden value
        is a 2-D tensor with shape  :math:`[T, D]` . The reset-hidden value is a
        2-D tensor with shape  :math:`[T, D]` . The gate value is a 2-D tensor with 
        shape  :math:`[T, 3*D]`.

    Examples:

        .. code-block:: python

          import paddle.fluid as fluid
          import paddle.fluid.dygraph.base as base
          import numpy

          lod = [[2, 4, 3]]
          D = 5
          T = sum(lod[0])

          input = numpy.random.rand(T, 3 * D).astype('float32')
          hidden_input = numpy.random.rand(T, D).astype('float32')
          with fluid.dygraph.guard():
              x = numpy.random.random((3, 32, 32)).astype('float32')
              gru = fluid.dygraph.GRUUnit(size=D * 3)
              dy_ret = gru(
                base.to_variable(input), base.to_variable(hidden_input))

    NtanhsigmoidFr5   c           
         s   t t|   || _tddddd}|| | _|| | _|| _|d }| j||d| g|d| _	dd| g}	|	| _
| j||	|dd| _d S )	Nr   r	   r   r   )identityr&  r%  Zrelur   TrP   )rR   r(   rS   rc   r#  
activationgate_activationrd   rf   rg   Z
_bias_sizerh   )
rG   r  rm   rn   r(  r)  Zorigin_moderN   Zactivation_dict	bias_sizerr   rH   rI   rS     s.   


zGRUUnit.__init__c              
   C   s   t  rt||| j| jd| jd| j\}}}|||fS t|dddgd t|dddgd |g|g| jgd}| jd urB| jg|d	< | j	| j
}| j	| j
}| j	| j
}| jjd
||||d| j| jdd |||fS )Nr(  r)  r   r5   r   r(   hidden)r~   Z
HiddenPrevWeightr   gru_unit)ZGateZResetHiddenPrevZHidden)r(  r)  r   )r   r    r-  rg   rh   r(  r)  r   r   r   rd   r   )rG   r   r+  ZgateZreset_hidden_preZupdated_hiddenr   rH   rH   rI   r     sH   




zGRUUnit.forward)NNr%  r&  Fr5   r   rH   rH   rr   rI   r(   k  s    n"r(   c                       s>   e Zd ZdZ									d fdd	Zdd	d
Z  ZS )r+   a  
    This interface is used to construct a callable object of the ``NCE`` class.
    For more details, refer to code examples.
    It implements the function of the ``NCE`` loss function.
    By default this function uses a uniform distribution for sampling, and it
    compute and return the noise-contrastive estimation training loss. See
    `Noise-contrastive estimation: A new estimation principle for unnormalized statistical models <http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf>`_ .

    Parameters:
        num_total_classes (int): Total number of classes in all samples.
        dim (int): Dimension of input (possibly embedding dim).
        param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
             of nce. If it is set to None or one attribute of ParamAttr, nce
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr (ParamAttr or bool, optional): The attribute for the bias of nce.
             If it is set to False, no bias will be added to the output units.
             If it is set to None or one attribute of ParamAttr, nce
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. Default: None.
        num_neg_samples (int, optional): The number of negative classes. The default value is 10.
        sampler (str, optional): The sampler used to sample class from negative classes.
                       It can be 'uniform', 'log_uniform' or 'custom_dist'.
                       default: 'uniform'.
        custom_dist (float[], optional): A float[] with size=num_total_classes.
                       It is used when sampler is set to 'custom_dist'.
                       custom_dist[i] is the probability of i-th class to be sampled.
                       Default: None.
        seed (int, optional): The seed used in sampler. Default: 0.
        is_sparse(bool, optional): The flag indicating whether to use sparse update. If is_sparse is True, the weight@GRAD and bias@GRAD will be changed to SelectedRows. Default: False.
        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".

    Attribute:
        **weight** (Parameter): the learnable weights of this layer.

        **bias** (Parameter or None): the learnable bias of this layer.
    
    Returns:
        None

    Examples:
        .. code-block:: python

            import numpy as np
            import paddle.fluid as fluid

            window_size = 5
            dict_size = 20
            label_word = int(window_size // 2) + 1
            inp_word = np.array([[1], [2], [3], [4], [5]]).astype('int64')
            nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32')

            with fluid.dygraph.guard():
                words = []
                for i in range(window_size):
                    words.append(fluid.dygraph.base.to_variable(inp_word[i]))

                emb = fluid.Embedding(
                    size=[dict_size, 32],
                    param_attr='emb.w',
                    is_sparse=False)

                embs3 = []
                for i in range(window_size):
                    if i == label_word:
                        continue

                    emb_rlt = emb(words[i])
                    embs3.append(emb_rlt)

                embs3 = fluid.layers.concat(input=embs3, axis=1)
                nce = fluid.NCE(
                             num_total_classes=dict_size,
                             dim=embs3.shape[1],
                             num_neg_samples=2,
                             sampler="custom_dist",
                             custom_dist=nid_freq_arr.tolist(),
                             seed=1,
                             param_attr='nce.w',
                             bias_attr='nce.b')

                wl = fluid.layers.unsqueeze(words[label_word], axes=[0])
                nce_loss3 = nce(embs3, wl)

    Nuniformr   Fr5   c                    s  t t   | _| _| _| _t  _|d ur|ng  jd< |dkr*d}n|dkr2d}n|dkr9|d us=J t	|}dg| }dg| }g }g }t
|D ],}|| | }|d dkri|||f qSd| dkrw|||f qS|||< d||< qSt	|rt	|r|d}|d}|d }|d }|d ||d < |||d < |d |d  d }|d dkr|||f nd| dkr|||f n|||< d||< t	|rt	|st	|r|d}d||d < d||d < t	|r	|d}d||d < d||d <  fd	d
}|t|d jd< |t|d jd< |t|d jd< d}ntd|d u rEd}nt|}| _|
}td t|||	||
|d _ j j j|gd jd _ jr j j jdgd jd _ j jd<  j jd< d S )NSampleWeightr.  r   Zlog_uniformr	   custom_distr   rv   c                    s&    j t | j| jt| d}d|_|S )NrK   T)rf   r   rM   rN   r   r   )Znumpy_arrayretrG   rH   rI   _init_by_numpy_array  s   z*NCE.__init__.<locals>._init_by_numpy_arrayr5   CustomDistProbsr  CustomDistAliasCustomDistAliasProbsr   zUnsupported sampler type.
   zWWith sparse mode, if your models has only small parameter prefetch may cause speed down)num_total_classesnum_neg_samplesr   samplerr  r  F)rL   rM   rQ   rN   Tr   r,  )rR   r+   rS   rb   rc   Z_num_total_classesrd   r#  _inputsr   rangeappendpopr  arrayZastype	Exceptionr   _num_neg_samplesprint_attrsrf   rg   rh   )rG   r8  dimsample_weightrm   rn   r9  r:  r0  r   r  rN   Zcustom_dist_lenZalias_probs_Zalias_ZbigsZlittlesiZnormal_probbiglittleZbig_idxZbig_probZbig_leftr3  r  rr   r2  rI   rS   w  s   








	



	zNCE.__init__c           	      C   sr  t  rKd| jd d| jd d| jd d| jd d| jd d| jd f}tj||| j| j| jd | jd | jd	 | jd
 g|R  \}}}|| jd  S t|dddgd t|ddgd t	|dt
td fd t|t
snJ t|t
suJ || jd< || jd< |d ur|ng | jd< | jj|jd}| jj|jd}| jj|jd}| jjd| j|||d| jd || jd  S )Nr8  r9  r   r:  r  r  r/  r4  r5  r6  r	   r   r5   r   r+   labelr  rE  r~   Labelr   nce)ZCostZSampleLogitsZSampleLabelsr   )r   rC  r    rK  rg   rh   r;  rA  r   r   r   r   r[   r   r   rN   r   )	rG   r   rI  rE  r   Zcostr   Zsample_logitsZsample_labelsrH   rH   rI   r     sX   


zNCE.forward)	NNNNr.  Nr   Fr5   Nr   rH   rH   rr   rI   r+      s    Yxr+   c                       r   )	r,   ao  
    This interface is used to construct a callable object of the ``PRelu`` class.
    For more details, refer to code examples.
    It implements three activation methods of the ``PRelu`` activation function.

    Equation:

    .. math::
        y = \max(0, x) + \\alpha * \min(0, x)

    Parameters:
        mode (str): The mode for weight sharing. It supports all, channel
          and element. all: all elements share same weight
          channel:elements in a channel share same weight
          element:each element has a weight
        channel (int, optional): The number of channels.
          This argument is required when mode is "channel".
          Default: None.
        input_shape (list or tuple, optional): The shape of input.
          This argument is required when mode is "element".
          Default: None.
        param_attr(ParamAttr, optional): The parameter attribute for the learnable
          weight (alpha). Default: None.
        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".

    Attribute:
        **weight** (Parameter): the learnable weights of this layer.
    
    Returns:
        None

    Examples:

        .. code-block:: python

          import paddle.fluid as fluid
          from paddle.fluid.dygraph.base import to_variable
          import numpy as np

          inp_np = np.ones([5, 200, 100, 100]).astype('float32')
          with fluid.dygraph.guard():
              inp_np = to_variable(inp_np)
              prelu0 = fluid.PRelu(
                 mode='all',
                 param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(1.0)))
              dy_rlt0 = prelu0(inp_np)
              prelu1 = fluid.PRelu(
                 mode='channel',
                 channel=200,
                 param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(1.0)))
              dy_rlt1 = prelu1(inp_np)
              prelu2 = fluid.PRelu(
                 mode='element',
                 input_shape=inp_np.shape,
                 param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(1.0)))
              dy_rlt2 = prelu2(inp_np)

    Nr5   c                    s   t t| jdd || _|| _|| _|dkrdg| _n5|dkr0t|ts(J dd|ddg| _n |dkrLt|t	t
fs?J ddgt	|dd   | _ntd	| j| j| jd
dtdd| _d S )Nprelu)
name_scopeallr	   channelz4channel argument is required when mode is 'channel'.elementz8input_shape argument is required when mode is 'element'.z,mode should be one of all, channel, element.r5   Fr   )rL   rM   rN   rQ   rO   )rR   r,   rS   _moderb   rd   Z_alpha_shaper[   r   r  tupler]   rf   r   rg   )rG   moderP  r$  rm   rN   rr   rH   rI   rS   W	  s:   
zPRelu.__init__c                 C   sb   t  rt|| jd| jS t|ddgd | j| j}| jj	d|| jdd| jid|id	 |S )
Nru   r   r5   r,   rM  )r   AlpharT  r   )r   r   r   r   )
r   r   rM  rg   rR  r   r   r   rd   r   rG   r   r   rH   rH   rI   r   {	  s   zPRelu.forwardr   r   rH   rH   rr   rI   r,   	  s    =$r,   c                       sB   e Zd ZdZ					d fdd	Zedddd	d
d Z  ZS )r-   a  

    **Add Bilinear Tensor Product Layer**

    This layer performs bilinear tensor product on two inputs.
    For example:

    .. math::
      out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1

    In this formula:
     - :math:`x`: the first input contains M elements, shape is [batch_size, M].
     - :math:`y`: the second input contains N elements, shape is [batch_size, N].
     - :math:`W_{i}`: the i-th learned weight, shape is [M, N]
     - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size].
     - :math:`y^\mathrm{T}`: the transpose of :math:`y`.

    Parameters:
       input1_dim (int): The dimension of each first input.
       input2_dim (int): The dimension of each second input.
       output_dim (int): The dimension of output of this layer.
       name (str, optional): The default value is None. Normally there is no need for user
           to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None.
       act (str, optional): Activation to be applied to the output of this layer. The default value is None.
       param_attr (ParamAttr, optional): The parameter attribute for the learnable w, parameters/weights of 
           this layer. The default value is None.
       bias_attr (ParamAttr, optional): The parameter attribute for the bias
           of this layer. If it is set to False, no bias will be added to the output units.
           If it is set to None, the bias is initialized zero. The default value is None.
       dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".

    Attribute:
        **weight** (Parameter): the learnable weights of this layer.

        **bias** (Parameter): the learnable bias of this layer.

    Returns:
       Tensor: A 2-D Tensor of shape [batch_size, size].

    Examples:
       .. code-block:: python

        import paddle
        import numpy

        layer1 = numpy.random.random((5, 5)).astype('float32')
        layer2 = numpy.random.random((5, 4)).astype('float32')
        bilinearTensorProduct = paddle.nn.BilinearTensorProduct(
            input1_dim=5, input2_dim=4, output_dim=1000)
        ret = bilinearTensorProduct(paddle.to_tensor(layer1),
                                    paddle.to_tensor(layer2))

    Nr5   c	                    s   t t|   || _|| _|| _|| _|| _|| _|| _	t
 | _|| _| j	| j| jg}	| j| j|	| jdd| _d| j	g}
| j| j|
| jdd| _d S )NFrP   r	   T)rR   r-   rS   rb   rc   rZ   _nameZ_input1_dimZ_input2_dimZ_output_dimr#  r;  rd   rf   rg   rh   )rG   Z
input1_dimZ
input2_dimr   r   rp   rm   rn   rN   r   r*  rr   rH   rI   rS   	  s,   	
zBilinearTensorProduct.__init__z2.0.0zpaddle.nn.Bilinearz1New name and new args in Bilinear, easier to use.)ZsinceZ	update_toreasonc                 C   s   t |dddgd t |dddgd ||| jd| _| jd ur%| j| jd< | jd ur=| jjd|  | jg| j	d	d
}n	| jj| j	d	d}| jj
d| jd|id | jj|| jdS )Nxr5   r   r-   y)r   r   r,  r   .Fr   rN   persistable)rN   r]  Zbilinear_tensor_productr   r   r   r   r   )r   rg   r;  rh   rW  r   create_variablejoinZ	full_namerd   r   r   rZ   )rG   rY  rZ  r   rH   rH   rI   r   	  s0   



zBilinearTensorProduct.forward)NNNNr5   )r   r   r   r   rS   
deprecatedr   r   rH   rH   rr   rI   r-   	  s    :r-   c                       s>   e Zd ZdZ										d fdd	Zd	d
 Z  ZS )r.   a  
    This interface is used to construct a callable object of the ``Conv2DTranspose`` class.
    For more details, refer to code examples.
    The convolution2D transpose layer calculates the output based on the input,
    filter, and dilations, strides, paddings. Input and output
    are in NCHW format. Where N is batch size, C is the number of feature map,
    H is the height of the feature map, and W is the width of the feature map.
    Filter's shape is [MCHW] , where M is the number of input feature map,
    C is the number of output feature map, H is the height of the filter,
    and W is the width of the filter. If the groups is greater than 1,
    C will equal the number of input feature map divided by the groups.
    If bias attribution and activation type are provided, bias is added to
    the output of the convolution, and the corresponding activation function
    is applied to the final result.
    The details of convolution transpose layer, please refer to the following explanation and references
    `conv2dtranspose <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_ .

    For each input :math:`X`, the equation is:

    .. math::

        Out = \sigma (W \\ast X + b)

    Where:

    * :math:`X`: Input value, a ``Tensor`` with NCHW format.
    * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
    * :math:`\\ast`: Convolution operation.
    * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
    * :math:`\\sigma`: Activation function.
    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.

    Example:

        - Input:

          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`

          Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`

        - Output:

          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`

        Where

        .. math::

           H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
           W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
           W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )

    Parameters:
        num_channels(int): The number of channels in the input image.
        num_filters(int): The number of the filter. It is as same as the output
            feature map.
        filter_size(int or tuple): The filter size. If filter_size is a tuple,
            it must contain two integers, (filter_size_H, filter_size_W).
            Otherwise, the filter will be a square.
        output_size(int or tuple, optional): The output image size. If output size is a
            tuple, it must contain two integers, (image_H, image_W). None if use
            filter_size, padding, and stride to calculate output_size.
            if output_size and filter_size are specified at the same time, They
            should follow the formula above. Default: None.
        padding(int or tuple, optional): The padding size. If padding is a tuple, it must
            contain two integers, (padding_H, padding_W). Otherwise, the
            padding_H = padding_W = padding. Default: 0.
        stride(int or tuple, optional): The stride size. If stride is a tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride. Default: 1.
        dilation(int or tuple, optional): The dilation size. If dilation is a tuple, it must
            contain two integers, (dilation_H, dilation_W). Otherwise, the
            dilation_H = dilation_W = dilation. Default: 1.
        groups(int, optional): The groups number of the Conv2D transpose layer. Inspired by
            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
            when group=2, the first half of the filters is only connected to the
            first half of the input channels, while the second half of the
            filters is only connected to the second half of the input channels.
            Default: 1.
        param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
            of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
            will create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv2d_transpose.
            If it is set to False, no bias will be added to the output units.
            If it is set to None or one attribute of ParamAttr, conv2d_transpose
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. Default: None.
        use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True.
        act (str, optional): Activation type, if it is set to None, activation is not appended.
            Default: None.
        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".

    Attribute:
        **weight** (Parameter): the learnable weights of filters of this layer.

        **bias** (Parameter or None): the learnable bias of this layer.

    Returns:
        None

    Examples:
       .. code-block:: python

          import paddle.fluid as fluid
          import numpy as np

          with fluid.dygraph.guard():
              data = np.random.random((3, 32, 32, 5)).astype('float32')
              conv2DTranspose = fluid.dygraph.nn.Conv2DTranspose(
                    num_channels=32, num_filters=2, filter_size=3)
              ret = conv2DTranspose(fluid.dygraph.base.to_variable(data))

    Nr   r	   Tr5   c                    sF  t t|   |	dusJ d|	| _|
| _|| _|| _|| _|| _|| _	|| _
|| _|| _|| _|| _|| _| j| jkrI| j| jkrI| j	sId| _nd| _t| j
dd| _
t| jdd| _t| jdd| _t| jdd	| _| jd u ryg | _not| jtrt| jrt| j| _n[t| jdd
| _nQt| jtrt| jdd
| _nAt| jtrt| jjd
ddgd t| jjdkr| jjd dks| jjd dkr| jjd dkr| j| jg| _ntdtdt| j
dd| _
| jd u rdn| j| _| j| j| j g| j }| j| j|| jd| _| j| j| jg| jdd| _ d S )NFz3param_attr should not be False in conv2d_transpose.Zdepthwise_conv2d_transposeZconv2d_transposer   r8   r7   r9   zconv2d_transpose.filter_sizeoutput_sizer  r  r.   r	   r   z-output_size must contain one or two integers.z+output_size should be list or int or Tensorr   TrP   )!rR   r.   rS   rb   rc   rZ   rU   rD   ra   r^   rX   rW   rY   r`   _output_sizerd   _op_typer   rV   r[   r  Z_contain_varZ_convert_to_tensor_listr   r   r   rN   r   rM   r]   rf   rg   rh   )rG   rj   rk   r?   rb  r8   r7   r9   rl   rm   rn   ro   rp   rN   rq   rr   rH   rI   rS   r
  s   


zConv2DTranspose.__init__c                 C   s0  t  r3tt| j}||| jd| jd| jd| jd| jd| j	d| j
}|}t|| jd}tj|| jdS t|d	g d
d |g| jgd}| j| j| j| j| j	| j
d}| jj|jd}| jj| j|d|i|d | jd ur| jj| jd}| jjd|g| jgdd|giddid n|}| jj|| jd}|S )Nrb  rz   r{   r|   rl   ro   r	   r   r   r   r.   r}   )rb  rz   r{   r|   rl   ro   r   r   r   r   r   r   rw   )r   getattrr    rd  rg   rc  rW   rX   rY   rU   r^   r   r   rh   r   rZ   r   r   r   rN   r   rd   r   )rG   r   opr   r   r   r   r   rH   rH   rI   r   
  s`   	

zConv2DTranspose.forward)
Nr   r	   r	   NNNTNr5   r   rH   rH   rr   rI   r.   	  s    ySr.   c                       s>   e Zd ZdZ						d fdd	Zdd Zd	d
 Z  ZS )SequenceConva:  
    This function creates the op for sequence_conv, using the inputs and
    other convolutional configurations for the filters and stride as given
    in the input parameters to the function.

    Parameters:
        name_scope(str): The name of this class.
        num_filters (int): number of filters.
        filter_size (int): the filter size (H and W). Default: 3.
        filter_stride (int): stride of the filter. Default: 1.
        padding (bool|None): if True, add paddings. Default: None
        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
            If it is set to False, no bias will be added to the output units.
            If it is set to None or one attribute of ParamAttr, sequence_conv
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. Default: None.
        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
            of sequence_conv. If it is set to None or one attribute of ParamAttr, sequence_conv
            will create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. Default: None.
        act (str): Activation type, if it is set to None, activation is not appended.
            Default: None.

    Attributes:
        weight (Parameter): the learnable weights of filters of this layer.
        bias (Parameter|None): the learnable bias of this layer.

    Returns:
        Variable: output of sequence_conv
    r   r	   Nc	           	         sL   t  rJ dtt| | || _|| _|| _|| _|| _|| _	|| _
d S )Nz8SequenceConv is not supported by dynamic graph mode yet!)r   rR   rg  rS   ra   r`   _filter_striderX   rc   rb   rZ   )	rG   rN  rk   r?   Zfilter_strider8   rn   rm   rp   rr   rH   rI   rS     s   	
zSequenceConv.__init__c                 C   sZ   | j || _| j|jd  | jg}| j| j|| jd| _| j| j	| jg| jdd| _
d S )Nr	   r   TrP   )r   r   rd   r`   rM   ra   rf   rb   rg   rc   rh   rG   r   rq   rH   rH   rI   _build_once,  s   zSequenceConv._build_oncec                 C   s   | j | j}| j jd|g| jgdd|i| jt| jd  | jdd | jd urH| j j| jd}| j jd|g| jgd	d|gid
did n|}| j j	|| j
dS )NZsequence_convr   r   r   r   )ZcontextStrideZcontextStartZcontextLengthr   r   r   r   rw   r	   r   )r   r   rd   r   rg   rh  r   r`   rh   r   rZ   r   rH   rH   rI   r   8  s0   
zSequenceConv.forward)r   r	   NNNNr   r   r   r   rS   rj  r   r   rH   rH   rr   rI   rg  
  s    "rg  c                       s6   e Zd ZdZ		d	 fdd	Zdd Zdd Z  ZS )
RowConva  
    ***Row-convolution operator***

    The row convolution is called lookahead convolution.  This operator was introduced in the following paper for DeepSpeech2:
    http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf

    The main motivation is that a bidirectional RNN, useful in DeepSpeech like speech models, learns representation for a sequence by performing a
    forward and a backward pass through the entire sequence. However, unlike
    unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online
    and low-latency setting. The lookahead convolution incorporates information
    from future subsequences in a computationally efficient manner to improve
    unidirectional recurrent neural networks. The row convolution operator is
    different from the 1D sequence convolution, and is computed as follows:

    Given an input sequence X of length t and input dimension D, and a filter (W) of size context * D.

    More details about row_conv please refer to the design document https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 .

    Parameters:
        name_scope(str): The name of this class.
        future_context_size (int): Future context size. Please note, the shape
            of convolution kernel is [future_context_size + 1, D].
        param_attr (ParamAttr): Attributes of parameters, including
            name, initializer etc. Default: None.
        act (str): Non-linear activation to be applied to output variable. Default: None.

    Attributes:
        weight (Parameter): the learnable weights of this layer.

    Returns:
        the output(Out) is a LodTensor, which supports variable time-length input sequences.
        The underlying tensor in this LodTensor is a matrix with shape T x N, i.e., the same shape as X.

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          import numpy

          with fluid.dygraph.guard():
              x = numpy.random.random((16)).astype('float32')
              rowConv = fluid.dygraph.nn.RowConv(
                    'RowConv', future_context_size=2)
              ret = rowConv(fluid.dygraph.base.to_variable(x))

    Nc                    s4   t  rJ dtt| | || _|| _|| _d S )Nz3RowConv is not supported by dynamic graph mode yet!)r   rR   rm  rS   rZ   rb   _future_context_size)rG   rN  Zfuture_context_sizerm   rp   rr   rH   rI   rS     s   
zRowConv.__init__c                 C   s>   | j || _| jd |jd g}| j| j|| jdd| _d S )Nr	   FrP   )r   r   rd   rn  rM   rf   rb   rg   ri  rH   rH   rI   rj    s   zRowConv._build_oncec                 C   sD   | j | j}| j jd|g| jgdd|gid | j j|| jdS )NZrow_convrk  r   r^  r   )r   r   rd   r   rg   r   rZ   rV  rH   rH   rI   r     s   zRowConv.forward)NNrl  rH   rH   rr   rI   rm  V  s    2rm  c                       s6   e Zd ZdZ						d
 fdd	Zdd	 Z  ZS )r0   a  
    :alias_main: paddle.nn.GroupNorm
	:alias: paddle.nn.GroupNorm,paddle.nn.layer.GroupNorm,paddle.nn.layer.norm.GroupNorm
	:old_api: paddle.fluid.dygraph.GroupNorm

    This interface is used to construct a callable object of the ``GroupNorm`` class.
    For more details, refer to code examples.
    It implements the function of the Group Normalization Layer.
    Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_ .

    Parameters:
        channels(int): The number of channels of input.
        groups(int): The number of groups that divided from channels.
        epsilon(float, optional): The small value added to the variance to prevent
                                  division by zero. Default: 1e-05.
        param_attr(ParamAttr, optional): The parameter attribute for the learnable
                                         scale :math:`g`. If it is set to False, no scale will be added to the output units.
                                         If it is set to None, the bias is initialized one. Default: None.
        bias_attr(ParamAttr, optional): The parameter attribute for the learnable
                                        bias :math:`b`. If it is set to False, no bias will be added to the output units.
                                        If it is set to None, the bias is initialized zero. Default: None.
        act(str, optional): Activation to be applied to the output of group normalization. Default: None.
        data_layout(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW.

    Returns:
        None

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          import numpy as np

          with fluid.dygraph.guard():
              x = np.random.random((8, 32, 32)).astype('float32')
              groupNorm = fluid.dygraph.nn.GroupNorm(channels=32, groups=4)
              ret = groupNorm(fluid.dygraph.base.to_variable(x))

    r   Nru   r5   c	           
         s   t t|   || _|| _|| _|| _|| _|| _|| _	|dkr&t
d| | jg}	| j| jp0d|	| j	tdd| _| j| jp@d|	| j	dd| _d S )Nru   zunsupported data layout:Fr   rK   TrP   )rR   r0   rS   rb   rc   r   Z	_channelsrU   rZ   rd   r]   rf   r   rg   rh   )
rG   Zchannelsrl   r   rm   rn   rp   r   rN   r   rr   rH   rI   rS     s*   	zGroupNorm.__init__c           	      C   s  | j j| jdd}| j j| jdd}t r*t|| j| j| j| j	d}t
|| jS t rNd| jd| j	f}tj|| j| j||g|R  \}}}t
|| jS d|i}| jd ur\| j|d< | jd urf| j|d< | j j| jd	}| j jd
||||d| j| j	dd | j || jS )NTr   ru   r   rl   r   r   r   r   
group_normr"  )r   rl   r   )r   r   rd   r   r   ro  rg   rh   r   rU   r   r   rZ   r   r    r   r   )	rG   r   r   r   r   r   r   r   Zgroup_norm_outrH   rH   rI   r     sL   




zGroupNorm.forward)r   NNNru   r5   r   rH   rH   rr   rI   r0     s    + r0   c                       r   )r1   a~	  
    This interface is used to construct a callable object of the ``SpectralNorm`` class.
    For more details, refer to code examples. It implements the function of the Spectral Normalization Layer.
    This layer calculates the spectral normalization value of weight parameters of
    fc, conv1d, conv2d, conv3d layers which should be 2-D, 3-D, 4-D, 5-D
    Parameters. Calculations are showed as follows.

    Step 1:
    Generate vector U in shape of [H], and V in shape of [W].
    While H is the :attr:`dim` th dimension of the input weights,
    and W is the product result of remaining dimensions.

    Step 2:
    :attr:`power_iters` should be a positive integer, do following
    calculations with U and V for :attr:`power_iters` rounds.

    .. math::

        \mathbf{v} := \frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2}

        \mathbf{u} := \frac{\mathbf{W}^{T} \mathbf{v}}{\|\mathbf{W}^{T} \mathbf{v}\|_2}

    Step 3:
    Calculate :math:`\sigma(\mathbf{W})` and normalize weight values.

    .. math::

        \sigma(\mathbf{W}) = \mathbf{u}^{T} \mathbf{W} \mathbf{v}

        \mathbf{W} = \frac{\mathbf{W}}{\sigma(\mathbf{W})}


    Refer to `Spectral Normalization <https://arxiv.org/abs/1802.05957>`_ .

    Parameters:
        weight_shape(list or tuple): The shape of weight parameter.
        dim(int, optional): The index of dimension which should be permuted to the first before reshaping Input(Weight) to matrix, it should be set as 0 if Input(Weight) is the weight of fc layer, and should be set as 1 if Input(Weight) is the weight of conv layer. Default: 0.
        power_iters(int, optional): The number of power iterations to calculate spectral norm. Default: 1.
        eps(float, optional): The epsilon for numerical stability in calculating norms. Default: 1e-12.
        name (str, optional): The default value is None.  Normally there is no need for user to set this property.  For more information, please refer to :ref:`api_guide_Name` .
        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".

    Returns:
        None

    Examples:
       .. code-block:: python

            import paddle
            x = paddle.rand((2,8,32,32))

            spectral_norm = paddle.nn.SpectralNorm(x.shape, dim=1, power_iters=2)
            spectral_norm_out = spectral_norm(x)

            print(spectral_norm_out.shape) # [2, 8, 32, 32]

    r   r	   -q=r5   c                    s   t t|   || _|| _|| _|| _t|| _t	
| jdks$J d|t| jk s2J d|| j| j }t	
| j| }| jt |g| jtddd| _d| j_| jt |g| jtddd| _d| j_d S )Nr   z5Any dimension of `weight_shape` cannot be equal to 0.zUThe input `dim` should be less than the length of `weight_shape`, but received dim={}rB   r   rK   T)rR   r1   rS   _power_iters_eps_dimrd   r  Z_weight_shaper  r  r   formatrf   r   r   weight_ur   weight_v)rG   Zweight_shaperD  power_itersepsrN   hwrr   rH   rI   rS   T  s>   
zSpectralNorm.__init__c                 C   s~   t  rt|| j| j| j| j| jS t|dddgd || j| jd}| j	
| j}| j	jd|d|i| j| j| jdd	 |S )
Nrg   r5   r   r1   )r,  UVspectral_normr   )rD  rw  rx  r   )r   r   r}  ru  rv  rs  rq  rr  r   r   r   rd   r   )rG   rg   r   r   rH   rH   rI   r   x  s&   zSpectralNorm.forward)r   r	   rp  r5   r   rH   rH   rr   rI   r1     s    <$r1   c                       s8   e Zd ZdZ							d fdd	Zd	d
 Z  ZS )r2   a  
    This interface is used to construct a callable object of the ``TreeConv`` class.
    For more details, refer to code examples.
    Tree-Based Convolution is a kind of convolution based on tree structure.
    Tree-Based Convolution is a part of Tree-Based Convolution Neural Network(TBCNN),
    which is used to classify tree structures, such as Abstract Syntax Tree.
    Tree-Based Convolution proposed a kind of data structure called continuous binary tree,
    which regards multiway tree as binary tree.
    The paper of Tree-Based Convolution Operator is here: `tree-based convolution <https://arxiv.org/abs/1409.5718v1/>`_ .
    
    Parameters:
        feature_size(int): last dimension of nodes_vector.
        output_size(int): output feature width.
        num_filters(int, optional): number of filters, Default: 1.
        max_depth(int, optional): max depth of filters, Default: 2.
        act(str, optional): activation function, Default: tanh.
        param_attr(ParamAttr, optional): the parameter attribute for the filters, Default: None.
        bias_attr(ParamAttr, optional): the parameter attribute for the bias of this layer, Default: None.
        name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` .
        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".

    Attribute:
        **weight** (Parameter): the learnable weights of filters of this layer.

        **bias** (Parameter or None): the learnable bias of this layer.

    Returns:
        None

    Examples:

        .. code-block:: python

          import paddle.fluid as fluid
          import numpy

          with fluid.dygraph.guard():
              nodes_vector = numpy.random.random((1, 10, 5)).astype('float32')
              edge_set = numpy.random.random((1, 9, 2)).astype('int32')
              treeConv = fluid.dygraph.nn.TreeConv(
                feature_size=5, output_size=6, num_filters=1, max_depth=2)
              ret = treeConv(fluid.dygraph.base.to_variable(nodes_vector), fluid.dygraph.base.to_variable(edge_set))
    r	   r   r%  Nr5   c
                    s   t t|   || _|| _|| _|| _|| _|| _|| _	|| _
|	| _| jd| j| jg}
| j	r<| j| j	| jg| jdd| _| j| j
|
| jdd| _d S )Nr   TrP   F)rR   r2   rS   rW  Z_feature_sizerc  rZ   
_max_depthra   rc   rb   rd   rf   rh   rg   )rG   Zfeature_sizerb  rk   	max_depthrp   rm   rn   r   rN   Zw_shaperr   rH   rI   rS     s,   
zTreeConv.__init__c                 C   s   t |dtd t |dtd | jr| j| j| jdd}n| jj| jd}| jjd||| jdd	|id
| j	id | j
rY| jj| jd}| jjd|g| jgdd	|giddid n|}| jj|| jdS )Nnodes_vectorr2   edge_setFr\  r   Z	tree_conv)ZNodesVectorZEdgeSetr   r   r  r   r   r   rw   r	   r   )r   r   rW  r_  rd   r   r   r   rg   r~  rc   rh   r   rZ   )rG   r  r  r   r   rH   rH   rI   r     s@   
zTreeConv.forward)r	   r   r%  NNNr5   r   rH   rH   rr   rI   r2     s    /r2   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	r3   a  
    This interface is used to construct a callable object of the ``FLatten`` class.
    For more details, refer to code examples.
    It implements flatten a contiguous range of dims into a tensor.

    Parameters:
        start_axis(int): first dim to flatten (default = 1)
        stop_axis(int): last dim to flatten (default = -1).
    
    Returns:
        None

    Examples:

        .. code-block:: python

          import paddle
          import numpy as np

          inp_np = np.ones([5, 2, 3, 4]).astype('float32')
          inp_np = paddle.to_tensor(inp_np)
          flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2)
          flatten_res = flatten(inp_np)

    r	   rv   c                    s   t t|   || _|| _d S rL  )rR   r3   rS   
start_axis	stop_axis)rG   r  r  rr   rH   rI   rS     s   
zFlatten.__init__c                 C   s   t jjj|| j| jd}|S )N)r  r  )rT   ZtensorZmanipulationflattenr  r  rV  rH   rH   rI   r     s
   
zFlatten.forward)r	   rv   r   rH   rH   rr   rI   r3     s    r3   )E
__future__r   rT   Z	six.movesr    r   r
   r   r   r   r   Z	frameworkr   r   r   r   r   r   r   r   r   r   Zdata_feederr   r   r   r   rm   r   r   r   r   r   r   Zlayer_object_helperr   numpyr  r  r  osZpaddle.utils.deprecatedra  r   r    __all__ZLayerr!   r"   r/   r#   r$   r)   r%   r&   r'   r*   r(   r+   r,   r-   r.   rg  rm  r0   r1   r2   r3   rH   rH   rH   rI   <module>   sn   0   K ] Ez  |q # 5 6 |pq }^Otvo