o
    Qe2                     @   sH   d dl mZ ddlmZ ddlmZ ddlmZ g ZG dd deZdS )	   )	Optimizer   )core)	framework)Variablec                       sJ   e Zd ZdZdZ						d fdd	Zdd	 Zd
d Zdd Z  Z	S )Adagrada  
    The Adaptive Gradient optimizer (Adagrad for short) use an optimization described 
    in paper: `Adaptive Subgradient Methods for Online Learning and
    Stochastic Optimization <http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_.

    The parameter ``param_out`` update rule with gradient ``grad``:

    .. math::

        moment\_out &= moment + grad * grad

        param\_out &= param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}


    The original paper does not have the ``epsilon`` attribute. It is added here
    in our implementation as also proposed `Per-parameter adaptive learning rate
    methods <http://cs231n.github.io/neural-networks-3/#ada>`_
    for numerical stability to avoid the division by zero error.

    Args:
        learning_rate (float|Tensor): The learning rate used to update ``Parameter``.
            It can be a float value or a ``Variable`` with a float type.
        epsilon (float, optional): A small float value for numerical stability.
            The default value is 1e-06.
	parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
	    This parameter is required in dygraph mode. And you can specify different options for \
            different parameter groups such as the learning rate, weight decay, etc, \
            then the parameters are list of dict. Note that the learning_rate in paramter groups \
            represents the scale of base learning_rate. \
	    The default value is None in static mode, at this time all parameters will be updated.
	weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
	    It canbe a float value as coeff of L2 regularization or \
	    :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
	    If a parameter has set regularizer using :ref:`api_paddle_fluid_param_attr_aramAttr` already, \
	    the regularization setting here in optimizer will be ignored for this parameter. \
	    Otherwise, the regularization setting here in optimizer will take effect. \
	    Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies, 
            ClipGradByGlobalNorm, ClipGradByNorm and ClipGradByValue. Default None, 
            meaning there is no gradient clipping.
        name (str, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.
        initial_accumulator_value (float, optional): Initial value for moment accumulator.
            The default value is 0.0.

    Examples:
        .. code-block:: python

            import paddle

            inp = paddle.rand(shape=[10, 10])
            linear = paddle.nn.Linear(10, 10)
            out = linear(inp)
            loss = paddle.mean(out)
            adagrad = paddle.optimizer.Adagrad(learning_rate=0.1,
                    parameters=linear.parameters())
            out.backward()
            adagrad.step()
            adagrad.clear_grad()

            #Note that the learning_rate of linear_2 is 0.01.
            linear_1 = paddle.nn.Linear(10, 10)
            linear_2 = paddle.nn.Linear(10, 10)
            inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
            out = linear_1(inp)
            out = linear_2(out)
            loss = paddle.mean(out)
            adagrad = paddle.optimizer.Adagrad(
                learning_rate=0.1,
                parameters=[{
                    'params': linear_1.parameters()
                }, {
                    'params': linear_2.parameters(),
                    'weight_decay': 0.001,
                    'learning_rate': 0.1,
                }],
                weight_decay=0.01)                   
            out.backward()
            adagrad.step()
            adagrad.clear_grad()

    Zmomentư>N        c                    sT   |d usJ |d usJ t t| j|||||d d| _|| _|| _||d| _d S )N)learning_rate
parametersweight_decay	grad_clipnameZadagrad)epsiloninitial_accumulator_value)superr   __init__type_epsilonr   _default_dict)selfr
   r   r   r   r   r   r   	__class__ HD:\Projects\ConvertPro\env\Lib\site-packages\paddle/optimizer/adagrad.pyr   n   s   

zAdagrad.__init__c                 C   sF   t |tjsJ t |tr| |}|D ]}| j| j|| jd qd S )N)Z
fill_value)
isinstancer   Blockdict_update_param_groupZ_add_accumulator_moment_acc_strr   )r   blockr   pr   r   r   _create_accumulators   s   

zAdagrad._create_accumulatorsc                 C   sx   t |tjsJ t |tr| |}| | j|d }|j| j|d |d || 	|d|d |dd| j
idd}|S )N    r   )ParamZGradZMomentZLearningRate)ZParamOutZ	MomentOutr   T)r   ZinputsZoutputsattrsZstop_gradient)r   r   r   r   r   Z_get_accumulatorr   Z	append_opr   Z_create_param_lrr   )r   r    Zparam_and_gradZ
moment_accZ
adagrad_opr   r   r   _append_optimize_op   s$   


zAdagrad._append_optimize_opc                 C   s6   | d| jd | _| d| jd | _| d}|S )Nr   r   params)getr   r   r   )r   r   r   r   r   r      s   
zAdagrad._update_param_group)r   NNNNr	   )
__name__
__module____qualname____doc__r   r   r"   r&   r   __classcell__r   r   r   r   r      s    Tr   N)	Z	optimizerr   Zfluidr   r   Zfluid.frameworkr   __all__r   r   r   r   r   <module>   s   