from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable, name_scope
from ..fluid.dygraph import no_grad
from paddle import _C_ops, _legacy_C_ops
import warnings
from ..fluid.layer_helper import LayerHelper
from ..fluid import unique_name
from ..fluid import layers
from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode

__all__ = []


class SGD(Optimizer):
    r"""
    Optimizer of the stochastic gradient descent algorithm.

    .. math::

        param\_out = param - learning\_rate * grad
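
    For example, with a learning rate of 0.1, a parameter value of 1.0, and a
    gradient of 0.5, the updated value is :math:`1.0 - 0.1 \times 0.5 = 0.95`.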

    Parameters:
        learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
            It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
            It can be a float value as the coefficient of L2 regularization or \
            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
            the regularization setting here in optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        multi_precision (bool, optional): Whether to keep an FP32 master copy of each FP16
            parameter and apply the update to the master copy, which improves numerical
            stability in mixed-precision training. It only takes effect for FP16 parameters.
            Default False.
        name (str, optional): The default value is None. Normally there is no need for the user
            to set this property. For more information, please refer to
            :ref:`api_guide_Name` .

    Examples:
        .. code-block:: python

            import paddle

            inp = paddle.uniform(min=-0.1, max=0.1, shape=[10, 10], dtype='float32')
            linear = paddle.nn.Linear(10, 10)
            out = linear(inp)
            loss = paddle.mean(out)
            sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
            out.backward()
            sgd.step()
            sgd.clear_grad()
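
        A hedged sketch of ``multi_precision`` with automatic mixed precision;
        it assumes a CUDA device and uses ``paddle.amp.auto_cast`` and
        ``paddle.amp.GradScaler``:

        .. code-block:: python

            import paddle

            model = paddle.nn.Linear(10, 10)
            sgd = paddle.optimizer.SGD(learning_rate=0.1,
                                       parameters=model.parameters(),
                                       multi_precision=True)
            scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
            inp = paddle.rand([4, 10])
            with paddle.amp.auto_cast():
                out = model(inp)
                loss = paddle.mean(out)
            scaled = scaler.scale(loss)    # scale the loss to avoid FP16 gradient underflow
            scaled.backward()
            scaler.minimize(sgd, scaled)   # unscale gradients, then apply the SGD step
            sgd.clear_grad()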

    """

    def __init__(self,
                 learning_rate=0.001,
                 parameters=None,
                 weight_decay=None,
                 grad_clip=None,
                 multi_precision=False,
                 name=None):
        if learning_rate is None:
            raise ValueError("learning_rate is not set")
        super(SGD, self).__init__(
            learning_rate=learning_rate,
            parameters=parameters,
            weight_decay=weight_decay,
            grad_clip=grad_clip,
            name=name)
        self.type = "sgd"
        self._multi_precision = multi_precision
        self._master_weights = {}
zSGD.__init__c                 C   s   |j | jv r| j|j  }|S t| jtsJ |j d }t|}tj||j	dddd}| jj
 }|jdd|gid|gi|jtjjjd	d
 || j|j < |S )NZ_fp32_masterr	   Zfloat32T)r   shapevaluedtypeZpersistablecastXZOut)Zin_dtypeZ	out_dtype)r   inputsoutputsattrs)r   r   
isinstancehelperr   r   generater   Zcreate_global_varr%   Zstartup_programZglobal_block	append_opr'   r   VarDescVarTypeZFP32)r   paramvarvar_nameblockr#   r#   r$   _create_master_weight^   s,   

zSGD._create_master_weightc                 C   st   t |tjsJ t |tr| |}|D ]#}| jr'|jtjj	j
kr'| |}q|jtjj	j
kr7| js7td qd S )NzAccumulating with FP16 in optimizer can lead to poor accuracy or slow convergence.Consider using multi_precision=True option of the Adam optimizer.)r-   r   Blockdict_update_param_groupr   r'   r   r1   r2   FP16r7   warningswarn)r   r6   r   pZmaster_pr#   r#   r$   _create_accumulatorsv   s   


zSGD._create_accumulatorsc           
    @no_grad
    def _append_optimize_op(self, block, param_and_grad):
        if isinstance(param_and_grad, dict):
            param_and_grad = self._update_param_group(param_and_grad)

        find_master = (self._multi_precision
                       and param_and_grad[0].dtype == core.VarDesc.VarType.FP16)
        master_weight = (self._master_weights[param_and_grad[0].name]
                         if find_master else None)

        lr = self._create_param_lr(param_and_grad)
        if in_dygraph_mode():
            _C_ops.sgd_(param_and_grad[0], lr, param_and_grad[1],
                        master_weight, find_master)
            return None
        if _in_legacy_dygraph():
            _legacy_C_ops.sgd(param_and_grad[0], lr, param_and_grad[1],
                              master_weight, param_and_grad[0], master_weight)
            return None

        assert isinstance(block, framework.Block)
        # Static graph: append the sgd op to the block explicitly.
        inputs = {
            "Param": param_and_grad[0],
            "Grad": param_and_grad[1],
            "LearningRate": lr
        }
        outputs = {"ParamOut": param_and_grad[0]}
        attrs = {"multi_precision": find_master}

        if find_master:
            inputs["MasterParam"] = master_weight
            outputs["MasterParamOut"] = master_weight

        sgd_op = block.append_op(
            type=self.type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=True)

        return sgd_op

    def _update_param_group(self, parameters):
        parameters = parameters.get('params')
        return parameters
zSGD._update_param_group)r   NNNFN)__name__
__module____qualname____doc__r   r7   r?   r   rB   r:   __classcell__r#   r#   r!   r$   r      s    .
+r   )Z	optimizerr   Zfluidr   r   Zfluid.frameworkr   r   Zfluid.dygraphr   Zpaddler
   r   r<   Zfluid.layer_helperr   r   r   r   r   __all__r   r#   r#   r#   r$   <module>   s   
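
# A minimal conceptual sketch (illustrative only, not part of this module's
# API) of the elementwise update the ``sgd`` kernel performs:
#
#     def sgd_update(param, grad, lr):
#         # param_out = param - learning_rate * grad
#         return param - lr * grad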