o
    QeP.                    @   s  d dl Z d dlmZ d dlZd dlmZmZ ddlT ejej	ej
ejejejejejejejejejejejejgZg dZG dd deZG d	d
 d
Zdd Zdd Zdd ZG dd dZG dd dZG dd dZ G dd dZ!G dd dZ"ej#ddddddfd d!Z$dS )"    N)Enum)TracerEventTypeTracerMemEventType   )*)Z	allreduce	broadcastZrpcc                   @   s0   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
S )
SortedKeysa  
    SortedKeys is used to specify how to sort items when printing :ref:`summary <api_paddle_profiler_profiler_summary>` table.

    The meaning of each SortedKeys is as following

    - **SortedKeys.CPUTotal** :  Sorted by CPU total time.

    - **SortedKeys.CPUAvg**  : Sorted by CPU average time.

    - **SortedKeys.CPUMax**  : Sorted by CPU max time.

    - **SortedKeys.CPUMin**  : Sorted by CPU min time.

    - **SortedKeys.GPUTotal**  : Sorted by GPU total time.

    - **SortedKeys.GPUAvg**  : Sorted by GPU average time.

    - **SortedKeys.GPUMax**  : Sorted by GPU max time.

    - **SortedKeys.GPUMin**  : Sorted by GPU min time.
    r   r                     N)__name__
__module____qualname____doc__CPUTotalCPUAvgCPUMaxCPUMinGPUTotalGPUAvgGPUMaxGPUMin r   r   RD:\Projects\ConvertPro\env\Lib\site-packages\paddle/profiler/profiler_statistic.pyr   #   s    r   c                   @   s@   e Zd ZdZdd Zdd Zedd Zedd	 Zd
d Z	dS )HostStatisticNodez?
    Wrap original node for calculating statistic metrics.
    c                 C   s:   || _ g | _g | _d| _d| _d| _d| _d| _d| _d S Nr   )	hostnodechildren_noderuntime_nodecpu_timeself_cpu_timegpu_timeself_gpu_timegeneral_gpu_timeself_general_gpu_time)selfr   r   r   r   __init__H   s   
zHostStatisticNode.__init__c                 C   s\  | j D ]}|  q| jD ]}|  q| jj| jj | _| j| _| j D ]}|  j|j7  _|  j	|j	7  _	|  j|j|j 8  _q$| jD ]-}|  j|j|j 8  _|  j|j7  _|  j
|j7  _
|  j	|j	7  _	|  j|j	7  _qE| jjD ]4}|jtjkr|  j|j|j 7  _|  j
|j|j 7  _
|  j	|j|j 7  _	|  j|j|j 7  _qwd S N)r    cal_statisticr!   r   end_nsstart_nsr"   r#   r$   r&   r%   r'   device_nodetyper   Kernel)r(   childrtZdevicer   r   r   r+   S   s.   





zHostStatisticNode.cal_statisticc                 C      | j jS r*   )r   r,   r(   r   r   r   r,   k      zHostStatisticNode.end_nsc                 C   r3   r*   )r   r-   r4   r   r   r   r-   o   r5   zHostStatisticNode.start_nsc                 C   s   t | j|S r*   )getattrr   r(   namer   r   r   __getattr__s   s   zHostStatisticNode.__getattr__N)
r   r   r   r   r)   r+   propertyr,   r-   r9   r   r   r   r   r   C   s    

r   c                 C   sf   t t}|  D ]'\}}g }|| || }|r0| }|| |jD ]}|| q&|sq	|S r*   )collectionsdefaultdictlistitemsappendpopr    )	nodetreesresults	thread_idrootnodestack
threadlistcurrent_node	childnoder   r   r   traverse_treew   s   



rI   c                 C   s^   g }g }| |  |r-| }|jD ]}| | q|jD ]}|jD ]}| | q"q|s|S )zD
    Get all device nodes called in the time range of hostnode.
    )r?   r@   r    r!   r.   )r   rE   device_nodesrG   rH   runtimenode
devicenoder   r   r   get_device_nodes   s   



rM   c                 C   s  i }t t}t t}|  D ]b\}}g }|| t|}g }|| |||< || }	|| }
|rr| }|	| | }|
| |jD ]}|| t|}|j| || qI|jD ]}t|}|j| qc|s4q| D ]\}}|	  qw||fS )zn
    Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics.
    )
r;   r<   r=   r>   r?   r   r@   r    r!   r+   )rA   Znode_statistic_treerB   Z
newresultsrC   rD   rE   root_statistic_nodeZnewstackrF   ZnewthreadlistrG   Zcurrent_statistic_noderH   Zchild_statistic_noderK   Zruntime_statistic_noder   r   r   	wrap_tree   sD   









rO   c                   @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )TimeRangeSummaryzO
    Analyse time ranges for each TracerEventType, and summarize the time.
    c                 C   sH   t t| _t dd | _t t| _t dd | _t t| _d S )Nc                   S   
   t tS r*   r;   r<   r=   r   r   r   r   <lambda>       z+TimeRangeSummary.__init__.<locals>.<lambda>c                   S   rQ   r*   )r;   r<   intr   r   r   r   rS         
 )	r;   r<   r=   CPUTimeRangeGPUTimeRangerU   CPUTimeRangeSumGPUTimeRangeSum
call_timesr4   r   r   r   r)      s   zTimeRangeSummary.__init__c              	   C   s  t |}| D ]\}}tt}tdd }|dd D ]V}||j |j|jf | j	|j  d7  < |j
D ]:}||j |j|jf | j	|j  d7  < |jD ]}	||	j |	j |	j |	j|	jf | j	|	j  d7  < qTq9q| D ]\}
}t|dd}t| j|
 |dd| j|
< qy| D ]-\}}| D ]$\}
}| D ]\}}t|dd}t| j| |
 |dd| j| |
< qqqq| j D ]\}
}t|| j|
< q| j D ]\}}| D ]\}
}t|| j| |
< qqdS )zm
        Analysis node trees in profiler result, and get time range for different tracer event type.
        c                   S   s   t dd S )Nc                   S   rQ   r*   rR   r   r   r   r   rS      rT   z:TimeRangeSummary.parse.<locals>.<lambda>.<locals>.<lambda>)r;   r<   r   r   r   r   rS      s    z(TimeRangeSummary.parse.<locals>.<lambda>r   NFZ	is_sortedT)rI   r>   r;   r<   r=   r/   r?   r-   r,   r[   r!   r.   	device_id	stream_idmerge_self_rangesmerge_rangesrW   rX   
sum_rangesrY   rZ   )r(   rA   thread2hostnodesthreadid	hostnodesrW   rX   r   rK   rL   
event_typeZtime_rangesr]   device_time_rangesZevent_time_rangesr^   r   r   r   parse   sl   









zTimeRangeSummary.parsec                 C   s
   | j  S r*   )rX   keysr4   r   r   r   get_gpu_devices      
z TimeRangeSummary.get_gpu_devicesc                 C   s   | j | | S r*   )rZ   )r(   r]   re   r   r   r   get_gpu_range_sum   s   z"TimeRangeSummary.get_gpu_range_sumc                 C   s
   | j | S r*   )rY   )r(   re   r   r   r   get_cpu_range_sum   rj   z"TimeRangeSummary.get_cpu_range_sumN)	r   r   r   r   r)   rg   ri   rk   rl   r   r   r   r   rP      s    
-rP   c                   @   s    e Zd ZdZdd Zdd ZdS )DistributedSummaryz
    Analysis communication and computation time range, and their overlap.
    The computation time is all kernel except kernels for communication like nccl.
    c                 C   s.   g | _ g | _g | _g | _g | _d| _d| _d S r   )cpu_communication_rangegpu_communication_rangecommunication_rangecomputation_rangeoverlap_range	cpu_calls	gpu_callsr4   r   r   r   r)     s   
zDistributedSummary.__init__c           	         s  t |}| D ]\}}|dd D ]  jtjkr>| j j jf t	 }|D ]}|jtj
kr<| j|j|jf q*q jtjkrst fddtD rs| j j jf t	 }|D ]}|jtj
krq| j|j|jf q_q jD ]*}|jD ]$}|jtj
krd|j v r| j|j|jf q{| j|j|jf q{qvqqtt| j| _tt| j| _t| jdd| _t| jdd| _t| j| jdd| _t| jdd| _t| j| jdd| _dS )	zH
        Collect all communication and computation time ranges.
        r   Nc                    s   g | ]	}| j  v qS r   )r8   lower).0r8   r   r   r   
<listcomp>#  s    z,DistributedSummary.parse.<locals>.<listcomp>ZncclFr\   T)rI   r>   r/   r   Communicationrn   r?   r-   r,   rM   r0   ro   Operatorany_CommunicationOpNamer!   r.   r8   ru   rq   lensetrs   rt   r_   r`   rp   Zintersection_rangesrr   )	r(   rA   rb   rc   rd   rJ   r.   rK   rL   r   rw   r   rg     s   





%zDistributedSummary.parseN)r   r   r   r   r)   rg   r   r   r   r   rm     s    	rm   c                   @   sr   e Zd ZdZG dd dZG dd dZG dd dZdd	 Zd
d Zdd Z	dd Z
dd Zdd Zdd ZdS )EventSummaryzT
    Analyse operator event in profiling data, correlate with its device event.
    c                   @   s0   e Zd Zdd Zedd Zdd Zdd Zd	S )
zEventSummary.DeviceItemc                 C   s&   || _ d| _d| _d| _td| _d S Nr   inf)r8   callr$   max_gpu_timefloatmin_gpu_timer7   r   r   r   r)   S  s
   z EventSummary.DeviceItem.__init__c                 C      | j | j S r*   r$   r   r4   r   r   r   avg_gpu_timeZ     z$EventSummary.DeviceItem.avg_gpu_timec                 C   2   || j kr|| _ || jk r|| _|  j|7  _d S r*   r   r   r$   r(   timer   r   r   add_gpu_time^  
   

z$EventSummary.DeviceItem.add_gpu_timec                 C   s$   |  j d7  _ | |j|j  d S Nr   )r   r   r,   r-   r(   noder   r   r   add_iteme  s   z EventSummary.DeviceItem.add_itemN)r   r   r   r)   r:   r   r   r   r   r   r   r   
DeviceItemQ  s    
r   c                   @   `   e Zd Zdd Zedd Zedd Zedd Zd	d
 Zdd Z	dd Z
dd Zdd ZdS )zEventSummary.OperatorItemc                 C   s^   || _ d| _d| _d| _d| _td| _d| _td| _i | _	i | _
d| _td| _d| _d S r   )r8   r   r"   r$   max_cpu_timer   min_cpu_timer   r   devicesoperator_innersr&   min_general_gpu_timemax_general_gpu_timer7   r   r   r   r)   k  s   



z"EventSummary.OperatorItem.__init__c                 C   r   r*   r"   r   r4   r   r   r   avg_cpu_timez  r   z&EventSummary.OperatorItem.avg_cpu_timec                 C   r   r*   r   r4   r   r   r   r   ~  r   z&EventSummary.OperatorItem.avg_gpu_timec                 C   r   r*   r&   r   r4   r   r   r   avg_general_gpu_time  r   z.EventSummary.OperatorItem.avg_general_gpu_timec                 C   r   r*   r   r   r"   r   r   r   r   add_cpu_time  r   z&EventSummary.OperatorItem.add_cpu_timec                 C   r   r*   r   r   r   r   r   r     r   z&EventSummary.OperatorItem.add_gpu_timec                 C   r   r*   r   r   r&   r   r   r   r   add_general_gpu_time  r   z.EventSummary.OperatorItem.add_general_gpu_timec                 C      |  j d7  _ d S r   r   r4   r   r   r   add_call     z"EventSummary.OperatorItem.add_callc                 C   s   |    | |j | |j | |j |jD ]!}|jt	j
kr:|j| jvr1t|j| j|j< | j|j | q|jD ] }|jD ]}|j}|| jvrUt|| j|< | j| | qCq>d S r*   )r   r   r"   r   r$   r   r&   r    r/   r   rz   r8   r   r   OperatorItemr   r!   r.   r   r   )r(   r   r1   rK   rL   r8   r   r   r   r     s*   




z"EventSummary.OperatorItem.add_itemNr   r   r   r)   r:   r   r   r   r   r   r   r   r   r   r   r   r   r   i  s    


r   c                   @   r   )zEventSummary.GeneralItemc                 C   sR   || _ d| _d| _d| _td| _d| _d| _td| _d| _	td| _
d| _d S r   )r8   r   r"   r   r   r   r$   r   r   r&   r   r   r7   r   r   r   r)     s   



z!EventSummary.GeneralItem.__init__c                 C   r   r*   r   r4   r   r   r   r     r   z%EventSummary.GeneralItem.avg_cpu_timec                 C   r   r*   r   r4   r   r   r   r     r   z%EventSummary.GeneralItem.avg_gpu_timec                 C   r   r*   r   r4   r   r   r   r     r   z-EventSummary.GeneralItem.avg_general_gpu_timec                 C   r   r*   r   r   r   r   r   r     r   z%EventSummary.GeneralItem.add_cpu_timec                 C   r   r*   r   r   r   r   r   r     r   z%EventSummary.GeneralItem.add_gpu_timec                 C   r   r*   r   r   r   r   r   r     r   z-EventSummary.GeneralItem.add_general_gpu_timec                 C   r   r   r   r4   r   r   r   r     r   z!EventSummary.GeneralItem.add_callc                 C   s0   |    | |j | |j | |j d S r*   )r   r   r"   r   r$   r   r&   r   r   r   r   r     s   z!EventSummary.GeneralItem.add_itemNr   r   r   r   r   GeneralItem  s    


r   c                 C   s:   i | _ tt| _i | _tt| _i | _i | _i | _	d S r*   )
r>   r;   r<   dictthread_itemsuserdefined_itemsuserdefined_thread_itemsmodel_perspective_itemsmemory_manipulation_itemskernel_itemsr4   r   r   r   r)     s   
zEventSummary.__init__c                 C   sX  t |\}}| D ]Q\}}|dd D ]?}|jtjkr!| | |jtjks-|jtjkrSd|j	 v sBd|j	 v sBd|j	 v rH| 
| q|jtjkrS| | q| |d  q
| D ]I\}}t }|| |r| }	|	jD ]0}
|
jtjks|
jtjks|
jtjks|
jtjkr| |
 qv|
jtjkr| |
 ||
 qv|soq`dS )z;
        Analysis operator event in the nodetress.
        r   NZmemcpyZ
memorycopymemsetr   )rO   r>   r/   r   rz   add_operator_itemUserDefinedPythonUserDefinedr8   ru   add_memory_manipulation_itemadd_userdefined_itemadd_kernel_itemr;   dequer?   popleftr    Forward
DataloaderBackwardOptimizationadd_model_perspective_itemProfileStep)r(   rA   Znode_statistic_treesZthread2host_statistic_nodesrc   Zhost_statistic_nodesZhost_statistic_noderN   r   rG   r1   r   r   r   rg     sB   




zEventSummary.parsec                 C   z   |j | jvrt|j | j|j < | j|j  | |j | j|j vr/t|j | j|j |j < | j|j |j  | d S r*   )r8   r>   r   r   r   r   rC   )r(   Zoperator_noder   r   r   r     s   
zEventSummary.add_operator_itemc                 C   r   r*   )r8   r   r   r   r   r   rC   )r(   Zuserdefined_noder   r   r   r   '  s*   

z!EventSummary.add_userdefined_itemc                 C   s6   |j | jvrt|j | j|j < | j|j  | d S r*   )r8   r   r   r   r   )r(   Zmemory_manipulation_noder   r   r   r   7  s   z)EventSummary.add_memory_manipulation_itemc                 C   s   |j tjkr	d}n&|j tjkrd}n|j tjkrd}n|j tjkr$d}n|j tjkr-d}nd S || jvr<t	|| j|< | j| 
| d S )Nr   r   r   r   r   )r/   r   r   r   r   r   r   r   r   r   r   )r(   Zmodel_perspective_noder8   r   r   r   r   ?  s   
z'EventSummary.add_model_perspective_itemc                 C   sR   t |}|D ] }|jtjkr&|j}|| jvrt|| j|< | j| | qd S r*   )	rM   r/   r   r0   r8   r   r   r   r   )r(   Z	root_noderJ   r.   r8   r   r   r   r   P  s   
zEventSummary.add_kernel_itemN)r   r   r   r   r   r   r   r)   rg   r   r   r   r   r   r   r   r   r   r   L  s    H9$r   c                   @   s6   e Zd ZdZG dd dZdd Zdd Zdd	 Zd
S )MemorySummaryz2
    Analyse memory events in profiling data.
    c                   @   s   e Zd ZdddZdd ZdS )zMemorySummary.MemoryItem	Allocatedc                 C   s4   || _ || _d| _d| _d| _d| _d| _|| _d S r   )
event_nameplaceallocation_count
free_countallocation_size	free_sizeincrease_sizememory_type)r(   r   r   r   r   r   r   r)   a  s   
z!MemorySummary.MemoryItem.__init__c                 C   s~   |t jks
|t jkr|  jd7  _|  j|7  _n|t jks#|t jkr2|  jd7  _|  j|8  _nt	d | j| j | _
d S )Nr   zNo corresponding type.)r   AllocateReservedAllocater   r   FreeReservedFreer   r   printr   )r(   sizeZallocation_typer   r   r   add_memory_recordk  s   z*MemorySummary.MemoryItem.add_memory_recordN)r   )r   r   r   r)   r   r   r   r   r   
MemoryItem_  s    

r   c                 C   s4   t t| _t t| _t t| _t t| _d S r*   )r;   r<   r   allocated_itemsreserved_itemsrU   peak_allocation_valuespeak_reserved_valuesr4   r   r   r   r)   x  s   zMemorySummary.__init__c                 C   s
  |j D ]}|jtjks|jtjkr6|| j|j vr't||jd| j|j |< | j|j | 	|j
|j n0|jtjksB|jtjkrf|| j|j vrXt||jd| j|j |< | j|j | 	|j
|j t| j|j |j| j|j< t| j|j |j| j|j< qd S )Nr   Reserved)Zmem_noder/   r   r   r   r   r   r   r   r   Zincrease_bytesr   r   r   maxr   Zpeak_allocatedr   Zpeak_reserved)r(   r   r   Zmemnoder   r   r   _analyse_node_memory  sZ   

z"MemorySummary._analyse_node_memoryc                 C   sr   t |}| D ].\}}|dd D ]#}|jtjkrq|jtjkr.|jD ]	}| |j| q$| |j| qqdS )z8
        Analyse memory event in the nodetress.
        r   N)	rI   r>   r/   r   OperatorInnerrz   r    r   r8   )r(   rA   rb   rc   Z
host_nodesZ	host_noder1   r   r   r   rg     s   
zMemorySummary.parseN)r   r   r   r   r   r)   r   rg   r   r   r   r   r   Z  s    r   c                   @   s   e Zd ZdZdd ZdS )StatisticDataz$
    Hold all analysed results.
    c                 C   s`   || _ || _t | _t | _t | _t | _	| j
| | j
| | j
| | j	
| d S r*   )
node_trees
extra_inforP   time_range_summaryr   event_summaryrm   distributed_summaryr   memory_summaryrg   )r(   r   r   r   r   r   r)     s   zStatisticData.__init__N)r   r   r   r   r)   r   r   r   r   r     s    r   TFmsd   K   c           L         s  ddl m} 	 d dgdg  gdl fdd	}	dd	 }
g fd
d}dmdd}dndd}| jtj}|d u sC|j|v rddg}d}d}|	| |dd  D ]}|	| qUd }d }d }||
|d || ||j|  || d|t| j	d g}||j|  d|t| j	d g}||j|  | j
 D ] }t| j|tj}|| }d|||g}||j|  q|| |d |d|  |d |d |dkrdS |d u s|j|v rg d}dgdg  gd}|D ]}|	| qd }d }d }||
|d  |d!| || ||j|  || tt}tt}tt}tt}|| jj || jj | jj D ]\}}|tjkrj|||< q\| jjrt| jj|tj< | jj|tj< tjtjtjtjfD ]*}t| d"d } ||v r| | j!j"v r| j!j"|  j#||< | j!j"|  j$||< qtt%}!| jj& D ]\}"}#|# D ]\}}$t'|!| |$d#d$|!|< qɐq|! D ]\}}$t|$||< q| jj(rt| jj(|tj< | jj)|tj< t*| d%d& d#d'}%|%d \}}&d(t| d"d || ||&|d)|t|&| g}||j|  |%dd  D ](\}}&d*t| d"d || ||&|d)|t|&| g}||j|  q;|| g d+}||j|  || | D ](\}}&d*t| d"d || ||&|d)|t|&| g}||j|  q{|| |d, |d|  |d |d |d u s|j+|v rQ| j!j"}'t,|'dkrQg }(d})d}*| j!j"d- j-}+d.D ]},|,|'v re|'|, }-|+dkrd}.nt|-j-|+ }.d-|,v rd(|,nd*|,},d(|,|-j#d/||-j$|d)||-j.|d)||-j/|d)||-j0|d)|t|-j$| d/||-j-|d)||-j1|d)||-j2|d)||-j3|d)||.g}|(4| d-|,vre|)|-j$7 })|*|-j-7 }*q||) }/|+|* }0|+dkrwd}.nt|0|+ }.d0dd1||/|d)|t|/| d1||0|d)||.g}|(4| d2}1d3}2d3}3|(D ]<}t5|d trt,t|d |1krt,t|d }1t,|d |2krt,|d }2t,|d4 |3krt,|d4 }3qg d5}dgdg  gd6}|	| |	|1 |	|2 |	|3 d }d }d }||
|d7 |d!| || ||j|  || |(D ]
}||j|  q0|| |d8 |d|  |d |d |d u s\|j6|v r$| jj7r$g d9}dgdg  gd}|D ]}|	| qsd }d }d }||
|d: |d!| || ||j|  || t| jj7}4t| jj8}5t| jj9}6d-|||d)|t|| g}||j|  d;||4|d)|t|4| g}||j|  d<||5|d)|t|5| g}||j|  d=||6|d)|t|6| g}||j|  || |d> |d|  |d |d |d u s/|j:|v ro| j!jrog }(d?}|d#krB| j!j;}7nd@| j!ji}7|7 D ]e\}8}9|(4dA|8 |t<j=krkt*|9 dBd& d#d'}%n{|t<j>kr}t*|9 dCd& d#d'}%ni|t<j?krt*|9 dDd& d#d'}%nW|t<j@krt*|9 dEd& dF}%nF|t<jAkrt*|9 dGd& d#d'}%n4|t<jBkrt*|9 dHd& d#d'}%n"|t<jCkrt*|9 dId& d#d'}%n|t<jDkrt*|9 dJd& dF}%d}:d};|%D ]\},}-|:|-j$7 }:|;|-jE7 };q|%D ]\},}-|:dkrd}<nt|-j$|: }<|;dkrd}.nt|-jE|; }.|,|-j#d/||-j$|d)||-j.|d)||-j/|d)||-j0|d)||<d/||-jE|d)||-jF|d)||-jG|d)||-jH|d)||.g}|(4| |r|-jI D ]\}=}>|-j$dkr|d}<nt|>j$|-j$ }<|-jEdkrd}.nt|>jE|-jE }.t,|=d |kr|=d |dK  }=|=dL7 }=d*|=|>j#d/||>j$|d)||>j.|d)||>j/|d)||>j0|d)||<d/||>jE|d)||>jF|d)||>jG|d)||>jH|d)||.g}|(4| |>jJ D ]W\}?}@|>jEdkrd}.nt|@j-|>jE }.t,|?dM |kr"|?d |dN  }?|?dL7 }?dO|?|@j#dPd/||@j-|d)||@j1|d)||@j2|d)||@j3|d)||.g}|(4| qqo|-jJ D ]W\}?}@|-jEdkrdd}.nt|@j-|-jE }.t,|?d |kr|?d |dK  }?|?dL7 }?d*|?|@j#dPd/||@j-|d)||@j1|d)||@j2|d)||@j3|d)||.g}|(4| qWqqLd2}1d3}2d3}3|(D ]D}t5|trŐqt5|d trt,t|d |1krt,t|d }1t,|d |2krt,|d }2t,|d4 |3krt,|d4 }3qg d5}dgdg  g|	| |	|1 |	|2 |	|3 d }d }d }||
|dQ |d!| || ||j|  || |(D ]}t5|trZ||
|| qI||j|  qI|| |d |d |d u sz|jK|v 	r| j!jL	rg }(| j!jL}A|t<jBkrt*|A dRd& d#d'}%n.|t<jCkrt*|A dSd& d#d'}%n|t<jDkrt*|A dTd& dF}%nt*|A dUd& d#d'}%d}B|%D ]
\},}-|B|-j-7 }Bq|%D ]<\},}-|Bdkrd}.nt|-j-|B }.|,|-j#d/||-j-|d)||-j1|d)||-j2|d)||-j3|d)||.g}|(4| qg dV}dW}d2}1d3}3|(D ]-}t5|d t	r<t,t|d |1k	r<t,t|d }1t,|d |3k	rKt,|d }3	qdgdg  g|	| |	|1 |	|3 d }d }d }||
|dX |d!| || ||j|  || tMNdY}C|(D ];}|CO|d }D|D	r|DPd|DPd },n|d },t,|,|k	r|,d |d4  dL |d< n|,|d< ||j|  	q|| |d |d |d u 	s|jQ|v r| j!jRrg }(| j!jR}E| j!j"d- jE}+|E D ]_\},}-|+dk
rd}.nt|-jE|+ }.|,|-j#d/||-j$|d)||-j.|d)||-j/|d)||-j0|d)|t|-j$| d/||-jE|d)||-jF|d)||-jG|d)||-jH|d)||.g}|(4| 	qg d5}d}d2}1d3}2d3}3|(D ]K}t,|d |k
r{t,|d }t5|d t
rt,t|d |1k
rt,t|d }1t,|d |2k
rt,|d }2t,|d4 |3k
rt,|d4 }3
qjdgdg  g|	| |	|1 |	|2 |	|3 d }d }d }||
|dZ |d!| || ||j|  || |(D ]
}||j|  
q|| |d |d |d u s|jS|v r| j!jTrg }(| j!j"d- jE}+|d#kr5| j!jU}Fnd@| j!jTi}F|F D ]\}8}9|(4dA|8 |t<j=kr]t*|9 d[d& d#d'}%n{|t<j>krot*|9 d\d& d#d'}%ni|t<j?krt*|9 d]d& d#d'}%nW|t<j@krt*|9 d^d& dF}%nF|t<jAkrt*|9 d_d& d#d'}%n4|t<jBkrt*|9 d`d& d#d'}%n"|t<jCkrt*|9 dad& d#d'}%n|t<jDkrt*|9 dbd& dF}%|%D ]_\},}-|+dkrd}.nt|-jE|+ }.|,|-j#d/||-j$|d)||-j.|d)||-j/|d)||-j0|d)|t|-j$| d/||-jE|d)||-jF|d)||-jG|d)||-jH|d)||.g}|(4| qڐq?d}d2}1d3}2d3}3|(D ]S}t5|trPqFt,|d |kr_t,|d }t5|d trzt,t|d |1krzt,t|d }1t,|d |2krt,|d }2t,|d4 |3krt,|d4 }3qFg d5}dgdg  g|	| |	|1 |	|2 |	|3 d }d }d }||
|dc |d!| || ||j|  || |(D ]}t5|tr||
|| q||j|  q|d |d |d u s|jV|v r| jWjXs| jWjYr| jWjX D ]\}G}Hg }(t*|H ddd& d#d'}%|%D ]\}I}-|I|-jZ|-j[|-j\|-j]|-j^|-j_g}|(4| q3t*| jWjY|G  ded& d#d'}J|JD ]\}I}-|I|-jZ|-j[|-j\|-j]|-j^|-j_g}|(4| q^g df}dgdg  gdg}d6}K|	| |	dh |	|K |	|K |	|K |	|K |	|K d }d }d }||
|di|G |dj| jWj`|G  |dk| jWja|G  || ||j|  || |(D ]}t5|tr||
|| q||j|  q|d |d q dS )oNr   )SummaryViewr	    <c                    s\   d  d| t |  d d   7  < d  d|  d   7  < d  |   7  < d S )Nr   z{: } -)str)paddingZtext_dir)SPACING_SIZEheader_sep_listline_length_listrow_format_listr   r   
add_column  s
   z _build_table.<locals>.add_columnc                 S   s,   | t | }|d }d| | d||   S )Nr	   r   )r}   )r   textZleft_lengthZhalfr   r   r   	add_title  s   z_build_table.<locals>.add_titlec                    s     |    d d S )N
)r?   )s)resultr   r   r?     s   
z_build_table.<locals>.appendr   r   c                 S   s\   | t dkrdS t | }|dkr|d }n|dkr|d }n|dkr&|d }d	d
| |S )z7
        Transform time in ns to time in unit.
        r   r   r   g    eAr   g    .Ausg     @@{}{:.2f}r   )r   format)r   unitindentr   r   r   r   format_time  s   

z!_build_table.<locals>.format_timec                 S   s   d d| | d S )zK
        Transform ratio within [0, 1] to percentage presentation.
        r   r   r   )r   )ratior   r   r   r   format_ratio  s   z"_build_table.<locals>.format_ratioZDevicezUtilization (%)      zDevice SummaryzCPU(Process)zProcess Cpu UtilizationzCPU(System)zSystem Cpu UtilizationzGPU{}a?  Note:
CPU(Process) Utilization = Current process CPU time over all cpu cores / elapsed time, so max utilization can be reached 100% * number of cpu cores.
CPU(System) Utilization = All processes CPU time over all cpu cores(busy time) / (busy time + idle time).
GPU Utilization = Current process GPU time / elapsed time.r   )z
Event TypeCallszCPU Time	Ratio (%)   zOverview SummaryzTime unit: {}.Tr\   c                 S   s   | d S r   r   xr   r   r   rS   l  s    z_build_table.<locals>.<lambda>)keyreversez{})r   z  {})r   r  zGPU Timer  a@  Note:
In this table, We sum up all collected events in terms of event type.
The time of events collected on host are presented as CPU Time, and as GPU Time if on device.
Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.
The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.
Example:
Thread 1:
  Operator: |___________|     |__________|
Thread 2:
  Operator:   |____________|     |___|
After merged:
  Result:   |______________|  |__________|
r   )r   r   r   r   r   z{} / {} / {} / {} / {}z  Othersz{} / - / - / - / {}r   (   r
   )Namer  z&CPU Total / Avg / Max / Min / Ratio(%)&GPU Total / Avg / Max / Min / Ratio(%)   zModel SummaryzNote:
In this table, GPU time is the sum of all device(GPU) events called in the phase.
Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.
)r  z
Total Timer  zDistribution Summaryz  Communicationz  Computationz	  Overlapa  Note:
Communication time: Communication Event time, Communication Op time and its kernel time on gpu.
Computation time: Kernel time, except kernels belong to communication(nccl kernels).
Overlap time: Communication time intersects with computation time.
Example:
Communication:
  CPU:              |_________________|
  GPU:                                  |______________|
  Total:            |_________________| |______________|
Computation time(Kernel):
  GPU:         |________________|
Overlap time:       |___________|
4   zAll threads mergedz
Thread: {}c                 S   
   | d j S r   r"   r  r   r   r   rS   _  rV   c                 S   r  r   r   r  r   r   r   rS   c  rV   c                 S   r  r   r   r  r   r   r   rS   g  rV   c                 S   r  r   r   r  r   r   r   rS   k  rV   )r  c                 S   r  r   r&   r  r   r   r   rS   n  rV   c                 S   r  r   r   r  r   r   r   rS   s  rV   c                 S   r  r   r   r  r   r   r   rS   x  rV   c                 S   r  r   r   r  r   r   r   rS   |  rV   r   z...r   r   z    {}z- / - / - / - / -zOperator Summaryc                 S   r  r   )r   r  r   r   r   rS   9  rV   c                 S   r  r   )r   r  r   r   r   rS   =  rV   c                 S   r  r   )r   r  r   r   r   rS   A  rV   c                 S   r  r   )r$   r  r   r   r   rS   D  rV   )r  r  r  Z   zKernel Summaryz(.+?)(<.*>)(\(.*\))zMemory Manipulation Summaryc                 S   r  r   r  r  r   r   r   rS     rV   c                 S   r  r   r  r  r   r   r   rS     rV   c                 S   r  r   r  r  r   r   r   rS     rV   c                 S   r  r   r  r  r   r   r   rS     rV   c                 S   r  r   r  r  r   r   r   rS     rV   c                 S   r  r   r  r  r   r   r   rS     rV   c                 S   r  r   r  r  r   r   r   rS      rV   c                 S   r  r   r  r  r   r   r   rS     rV   zUserDefined Summaryc                 S   r  r   r   r  r   r   r   rS   Z  rV   c                 S   r  r   r  r  r   r   r   rS   h  rV   )r  TypezAllocation Countz
Free CountzAllocation Sizez	Free SizezIncreased Size2      zMemory Summary - {}zPeak Allocated Memory: {}zPeak Reserved Memory: {})r   )r   r   )r   )bZprofilerr   r   rl   r   r   Z
DeviceViewr   r   r   ri   rk   r0   joinZOverViewr;   r<   rU   updater[   rY   r>   ry   r   rn   ra   rs   r   r   r   r   r   splitr   r   r   r"   r=   rX   r`   ro   rt   sortedZ	ModelViewr}   r$   r   r   r   r   r   r   r?   
isinstanceZDistributedViewrp   rq   rr   ZOperatorViewr   r   r   r   r   r   r   r   r   r   r&   r   r   r   r   r   Z
KernelViewr   recompilematchgroupZMemoryManipulationViewr   ZUDFViewr   r   Z
MemoryViewr   r   r   r   r   r   r   r   r   r   r   )LZstatistic_dataZ	sorted_byZ	op_detailZ
thread_sepZ	time_unitZ	row_limitZmax_src_column_widthZviewsr   r   r   r?   r   r   
total_timeheadersZname_column_widthZDEFAULT_COLUMN_WIDTH_Z
row_formatZ
header_sepline_lengthZ
row_valuesZgpu_namer$   ZutilizationZcpu_type_timeZgpu_type_timeZcpu_call_timesZgpu_call_timesre   valueZevent_type_nameZgpu_time_ranger]   rf   Z
time_rangesorted_itemsr   r   Zall_row_valuesZaccmulation_timeZgpu_accmulation_timeZgpu_total_timer8   itemZ	gpu_ratioZ
other_timeZother_gpu_timeZcalltime_widthZcpu_data_description_widthZgpu_data_description_widthZcommunication_timeZcomputation_timeZoverlap_timer   rC   r>   Ztotal_op_cpu_timeZtotal_op_gpu_timeZ	cpu_ratioZinnerop_nameZinnerop_nodeZdevice_node_namer.   r   Ztotal_kernel_gpu_timeZkernel_name_patternr$  r   r   Zdevice_typeZmemory_eventsr   Zsorted_reserved_itemsZnumber_column_widthr   )r   r   r   r   r   r   _build_table  s2  	










































 













r-  )%r;   enumr   r"  Zpaddle.fluid.corer   r   Zstatistic_helperrz   r   r   ZCudaRuntimer0   ZMemcpyZMemsetr   r   r   r   r   ry   ZPythonOpr   Z_AllTracerEventTyper|   r   r   rI   rM   rO   rP   rm   r   r   r   r   r-  r   r   r   r   <module>   sD   
 4&FJ  M