import os
import logging
from enum import Enum

import paddle
from paddle.optimizer import Optimizer
from paddle.distributed.utils.log_utils import get_logger
from paddle.fluid.framework import in_dygraph_mode

# Legacy and new dynamic-graph sharding wrappers; which set is used depends on in_dygraph_mode().
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import GroupShardedStage3
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import GroupShardedScaler

logger_ = get_logger(logging.WARNING)


def group_sharded_parallel(
    model,
    optimizer,
    level,
    scaler=None,
    group=None,
    offload=False,
    sync_buffers=False,
    buffer_max_size=2**23,
    segment_size=2**20,
    sync_comm=False,
):
    """
    group_sharded_parallel applies group sharded configuration to the given model, optimizer and GradScaler. `level` has three string options, 'os', 'os_g' and 'p_g_os', corresponding to three usage scenarios: optimizer state segmentation, optimizer state + gradient segmentation, and parameter + gradient + optimizer state segmentation.
    In practice, optimizer state + gradient segmentation is a further optimization of optimizer state segmentation, so the 'os' level is currently implemented through 'os_g'.

    Args:
        model (Layer): The layer to be wrapped with group_sharded_parallel.
        optimizer (Optimizer): The optimizer to be wrapped with group_sharded_parallel.
        level (str): The group sharded level to use: `os`, `os_g` or `p_g_os`.
        scaler (GradScaler, optional): If AMP is used, you need to pass GradScaler. Defaults to None, indicating that GradScaler is not used.
        group (Group, optional): The group instance. Defaults to None, indicating that the default environment group is used.
        offload (bool, optional): Whether to use the offload function. Defaults to False, which means that the offload function is not used.
        sync_buffers (bool, optional): Whether to broadcast model buffers. It is generally used when there are registered model buffers. Defaults to False, indicating that model buffers are not synchronized.
        buffer_max_size (int, optional): The max size of the buffer used to integrate gradients in `os_g`. The larger the size, the more GPU memory will be used. Defaults to 2**23, which means that the buffer size is 2**23.
        segment_size (int, optional): The smallest size of a parameter to be sharded in `p_g_os`. Defaults to 2**20, indicating that the minimum segmented parameter size is 2**20.
        sync_comm (bool, optional): Whether to use synchronous communication; only used in `p_g_os`. Defaults to False, indicating that asynchronous communication is used.

    Returns:
        model: The given model wrapped for group sharding.
        optimizer: The given optimizer wrapped for group sharding.
        scaler: The given scaler wrapped for group sharding (None if no scaler was passed).

    Examples:
        .. code-block:: python

            # required: distributed
            import paddle
            from paddle.fluid.dygraph.nn import Linear
            from paddle.distributed import fleet
            from paddle.distributed.sharding import group_sharded_parallel

            fleet.init(is_collective=True)
            group = paddle.distributed.new_group([0, 1])
            model = Linear(1000, 1000)

            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters(), weight_decay=0.00001, grad_clip=clip)

            scaler = paddle.amp.GradScaler(init_loss_scaling=32768)

            # wrap sharding model, optimizer and scaler
            model, optimizer, scaler = group_sharded_parallel(model, optimizer, "p_g_os", scaler=scaler)

            img, label = data  # a mini-batch from your dataloader
            label.stop_gradient = True
            img.stop_gradient = True

            out = model(img)
            loss = paddle.nn.functional.cross_entropy(input=out, label=label)

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
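            # With AMP enabled, the wrapped scaler is typically used instead of the plain
            # backward/step above (a minimal sketch, not part of the original example):
            # with paddle.amp.auto_cast():
            #     out = model(img)
            #     loss = paddle.nn.functional.cross_entropy(input=out, label=label)
            # scaler.scale(loss).backward()
            # scaler.step(optimizer)
            # scaler.update()
            # optimizer.clear_grad()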
    """
    # check the option types
    assert isinstance(model, paddle.nn.Layer), "The model must be the instance of paddle.nn.Layer."
    assert isinstance(optimizer, Optimizer), "The optimizer must be the instance of paddle.optimizer.Optimizer."
    assert level in ["os", "os_g", "p_g_os"], "The level must be os, os_g or p_g_os."

    def check_dtype(param):
        return param.dtype == paddle.float16

    # a GradScaler is required as soon as any parameter is fp16
    params_fp16 = list(filter(check_dtype, model.parameters()))
    if scaler is None and len(params_fp16) > 0:
        raise ValueError("Please enter the correct scaler.")

    # convert model/optimizer/scaler according to the requested level
    if level in ["os", "os_g"]:
        logger_.info("*" * 30)
        logger_.info("Sharded level os uses sharded level os_g achieved now.")
        logger_.info("*" * 30)
        if in_dygraph_mode():
            optimizer = GroupShardedOptimizerStage2(
                params=optimizer._parameter_list, optim=optimizer, group=group, offload=offload
            )
            model = GroupShardedStage2(
                model, optimizer, group=group, sync_buffers=sync_buffers, buffer_max_size=buffer_max_size
            )
        else:
            optimizer = ShardingOptimizerStage2(
                params=model.parameters(), optim=optimizer, group=group, offload=offload
            )
            model = ShardingStage2(
                model, optimizer, group=group, sync_buffers=sync_buffers, buffer_max_size=buffer_max_size
            )
    elif level == "p_g_os":
        if in_dygraph_mode():
            model = GroupShardedStage3(
                model, optimizer=optimizer, group=group, sync_buffers=sync_buffers,
                segment_size=segment_size, offload=offload, sync_comm=sync_comm,
            )
        else:
            model = ShardingStage3(
                model, optimizer=optimizer, group=group, sync_buffers=sync_buffers,
                segment_size=segment_size, offload=offload, sync_comm=sync_comm,
            )
    else:
        raise ValueError("Please enter the correct level.")

    # wrap the GradScaler so it cooperates with the sharded optimizer
    if isinstance(scaler, paddle.amp.GradScaler):
        scaler = GroupShardedScaler(scaler) if in_dygraph_mode() else ShardingScaler(scaler)

    logger_.info("*" * 30)
    logger_.info(
        "If there is a communication hang using group sharded, please check whether "
        "the communication operations of each process are unified."
    )
    logger_.info("*" * 30)

    return model, optimizer, scaler


def save_group_sharded_model(model, output, optimizer=None):
    """
    Save the state of a group sharded encapsulated model and, optionally, its optimizer.

    Note:
        If the model is saved with save_group_sharded_model, then when loading it again you need to set the model or optimizer state dict before wrapping with group_sharded_parallel.

    Args:
        model (Layer): A wrapper for group sharded given model.
        output (str): Save directory.
        optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None, indicating that the optimizer state is not saved.

    Examples:
        .. code-block:: python

            # required: distributed
            import paddle
            from paddle.fluid.dygraph.nn import Linear
            from paddle.distributed import fleet
            from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model

            fleet.init(is_collective=True)
            group = paddle.distributed.new_group([0, 1])
            model = Linear(1000, 1000)

            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters(), weight_decay=0.00001, grad_clip=clip)

            scaler = paddle.amp.GradScaler(init_loss_scaling=32768)

            # wrap sharding model, optimizer and scaler
            model, optimizer, scaler = group_sharded_parallel(model, optimizer, "p_g_os", scaler=scaler)

            img, label = data  # a mini-batch from your dataloader
            label.stop_gradient = True
            img.stop_gradient = True

            out = model(img)
            loss = paddle.nn.functional.cross_entropy(input=out, label=label)

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            # save model and optimizer state_dict
            output_dir = "./group_sharded_checkpoint"
            save_group_sharded_model(model, output=output_dir, optimizer=optimizer)
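            # To resume later (a minimal sketch, assuming the path above): load the saved
            # state dicts into the unwrapped model/optimizer first, then call
            # group_sharded_parallel again, as described in the Note above.
            # model.set_state_dict(paddle.load(os.path.join(output_dir, "model.pdmodel")))
            # optimizer.set_state_dict(paddle.load(os.path.join(output_dir, "model.pdopt")))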
    """
    logger_.info("==========Begin to save group sharded model and optimizer==========")
    assert not os.path.isfile(
        output
    ), "Saving directory ({}) should be a directory, not a file".format(output)
    os.makedirs(output, exist_ok=True)

    # save the inner layer's state dict; stage3 may need to gather full parameters first
    output_model = os.path.join(output, "model.pdmodel")
    if isinstance(model, (ShardingStage2, GroupShardedStage2)):
        paddle.save(model._layer.state_dict(), output_model)
    elif isinstance(model, (ShardingStage3, GroupShardedStage3)):
        convert2cpu = True if model._offload else False
        model.get_all_parameters(convert2cpu=convert2cpu)
        paddle.save(model._layer.state_dict(), output_model)
    else:
        raise ValueError("Please use the layer which is wrapped with group_sharded_parallel.")

    # save the inner optimizer's state dict if an optimizer wrapper was passed
    if optimizer is not None:
        assert hasattr(
            optimizer, "_optim"
        ), "Please use the optimizer which is wrapped with group_sharded_parallel."
        output_opt = os.path.join(output, "model.pdopt")
        paddle.save(optimizer._optim.state_dict(), output_opt)
    logger_.info("==========End to save group sharded model and optimizer==========")