o
    e+H                     @   s   d Z ddlmZmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZ dd	lmZ dd
l	mZmZ ddlmZ ddlmZ G dd deZdS )aI  Text block objects based on PDF raw dict extracted with ``PyMuPDF``.

Data structure based on this `link <https://pymupdf.readthedocs.io/en/latest/textpage.html>`_::

    {
        # raw dict
        # --------------------------------
        'type': 0,
        'bbox': (x0,y0,x1,y1),
        'lines': [ lines ]

        # introduced dict
        # --------------------------------
        'before_space': bs,
        'after_space': as,
        'line_space': ls,

        'alignment': 0,
        'left_space': 10.0,
        'right_space': 0.0,

        'tab_stops': [15.4, 35.0]
    }
    )PtInches)WD_ALIGN_PARAGRAPH   )Lines   )	ImageSpan)RectTypeTextAlignmentlower_round)Block)rgb_component_from_namer   )	constants)docxc                       s   e Zd ZdZd+def fddZedd Zedd	 Zed
d Z	edd Z
edd Zedd Z fddZdd Z fddZdd Zdedededededefd d!Zd"d# Zd$d% Zd&d' Zd(ededededef
d)d*Z  ZS ),	TextBlockzText block.Nrawc                    sL   |pi }d|v r| d t | t| d|dg | _|   d S )Nbbox)parentlines)popsuper__init__r   restoregetr   Zset_text_block)selfr   	__class__ GD:\Projects\ConvertPro\env\Lib\site-packages\pdf2docx/text/TextBlock.pyr   )   s
   zTextBlock.__init__c                 C      dd | j D }d|S )zJText content in block. Note image is counted as a placeholder ``<image>``.c                 S      g | ]}|j qS r   )text.0liner   r   r   
<listcomp>:       z"TextBlock.text.<locals>.<listcomp> r   joinr   Z
lines_textr   r   r   r!   7      
zTextBlock.textc                 C   r   )z5Raw text content in block without considering images.c                 S   r    r   )raw_textr"   r   r   r   r%   @   r&   z&TextBlock.raw_text.<locals>.<listcomp>r'   r(   r*   r   r   r   r,   =   r+   zTextBlock.raw_textc                 C      t dd | jD S )zZIf this block contains only white space or not. If True, this block is safe to be removed.c                 s       | ]}|j V  qd S N)white_space_onlyr"   r   r   r   	<genexpr>F       z-TextBlock.white_space_only.<locals>.<genexpr>)allr   r   r   r   r   r0   C   s   zTextBlock.white_space_onlyc                 C   s   | j jS )zsAll lines contained in text block must have same text direction. 
        Otherwise, set normal direction.
        )r   text_directionr4   r   r   r   r5   H   s   zTextBlock.text_directionc                    sn   | j rdnd | j }t|}|dkrdS | j d  | j   } fdd}tt||}|| |d  S )z4Average distance between adjacent two physical rows.r   r   Nr   c                       t  fdd| D S )Nc                 3   s,    | ]}t |j d   |j   V  qdS )r   N)absr   r"   idxr   r   r1   [   s   * z>TextBlock.average_row_gap.<locals>.<lambda>.<locals>.<genexpr>maxrowr8   r   r   <lambda>[       z+TextBlock.average_row_gap.<locals>.<lambda>)is_horizontal_textr   group_by_physical_rowslenr   summap)r   rowsnumblock_heightZf_max_row_heightZsum_row_heightr   r8   r   average_row_gapO   s   
zTextBlock.average_row_gapc                 C   s   t | j S )zCount of physical rows.)rB   r   rA   r4   r   r   r   	row_count`   s   zTextBlock.row_countc                    s"   t   }|d| j i |S )Nr   )r   storeupdater   )r   resr   r   r   rJ   f   s
   

zTextBlock.storec                 C   s:   t |tttfr|D ]}| j| q
dS | j| dS )zAdd line or lines to TextBlock.N)
isinstancer   listtupler   append)r   Zline_or_linesr$   r   r   r   addn   s
   zTextBlock.addc                    sb   t d}t j||dd | jD ]}t d}|j||d |jD ]}t d}|j||d q qdS )	ztPlot block/line/span area for debug purpose.
        
        Args:
            page (fitz.Page): pdf page.
        bluez[3.0 3.0] 0)strokedashesred)rS   r'   )colorN)r   r   plotr   spans)r   pagerR   r$   rU   spancr   r   r   rW   w   s   

zTextBlock.plotc                 C   sF   d}|D ]}| tjs|jrq| j|jsq| j|r d}q|S )zParse text format with style represented by rectangles.
        
        Args:
            shapes (Shapes): Shapes representing potential styles applied on blocks.
        FT)Zequal_to_typer	   Z	HYPERLINKZis_determinedr   Z
intersectsr   parse_text_format)r   Zshapesflagshaper   r   r   r\      s   zTextBlock.parse_text_formatline_separate_thresholdline_break_width_ratioline_break_free_space_ratiolines_left_aligned_thresholdlines_right_aligned_thresholdlines_center_aligned_thresholdc                 C   s   | j rdnd\}}	}
| |||	|
f||||| _| jtjkr(tj| _| j| | j}|dkr9| jtjkr9d| _	n|dkrG| jtj
krGd| _n|dkrW| jtjkrWd| _d| _	| j||| dS )a   Set horizontal spacing based on lines layout and page bbox.
        
        * The general spacing is determined by paragraph alignment and indentation.
        * The detailed spacing of block lines is determined by tab stops.

        Multiple alignment modes may exist in block (due to improper organized lines
        from ``PyMuPDF``), e.g. some lines align left, and others right. In this case,
        **LEFT** alignment is set, and use ``TAB`` to position each line.
        )r   r   g      ?)   r   g      r   r   N)r@   _parse_alignment	alignmentr
   NONELEFTr   Zparse_tab_stoprI   right_spaceRIGHT
left_spaceCENTERZparse_line_break)r   r   r_   r`   ra   rb   rc   rd   idx0idx1frI   r   r   r   parse_horizontal_spacing   s.   z"TextBlock.parse_horizontal_spacingc                    s   | j D ]}tdd |jD rtj| _ dS q| jrdnd}| j|d  | j|  }| j  }dd   fd	dt	fd
d|D }|| }t
|dkrTt|tj}|| _dS )al  Calculate relative line spacing, e.g. `spacing = 1.02`.  Relative line spacing is based on standard 
        single line height, which is font-related. 

        .. note::
            The line spacing could be updated automatically when changing the font size, while the layout might
            be broken in exact spacing mode, e.g. overlapping of lines.
        c                 s   s    | ]
}t |tr|V  qd S r/   )rM   r   r#   rZ   r   r   r   r1      s    z8TextBlock.parse_relative_line_spacing.<locals>.<genexpr>Nr   r   r   c                 S   r-   )Nc                 s   r.   r/   )line_heightrr   r   r   r   r1      r2   JTextBlock.parse_relative_line_spacing.<locals>.<lambda>.<locals>.<genexpr>)r;   rX   )r$   r   r   r   r>      s    z7TextBlock.parse_relative_line_spacing.<locals>.<lambda>c                    r6   )Nc                 3       | ]} |V  qd S r/   r   r"   fun_max_line_heightr   r   r1          rt   r:   r<   rv   r   r   r>      r?   c                 3   ru   r/   r   )r#   r=   )fun_max_row_heightr   r   r1      rx   )r   rN   rX   r   ZDEFULT_LINE_SPACING
line_spacer@   r   rA   rC   rB   r;   )r   r$   r9   rG   rE   Zstandard_heightrz   r   )rw   ry   r   parse_relative_line_spacing   s   
	

z%TextBlock.parse_relative_line_spacingc                 C   s   | j rdnd}| jd j}||d  ||  }| j|d  | j|  }| j}|dkr3|| |d  }n|}|| _|  j|| 7  _| jdk rU|  j| j| 7  _d| _dS dS )a0  Calculate exact line spacing, e.g. `spacing = Pt(12)`. 

        The layout of pdf text block: line-space-line-space-line, excepting space before first line, 
        i.e. space-line-space-line, when creating paragraph in docx. So, an average line height is 
        ``space+line``. Then, the height of first line can be adjusted by updating paragraph before-spacing.

        .. note::
            Compared with the relative spacing mode, it has a more precise layout, but less flexible editing
            ability, especially changing the font size.
        r   r   r           N)r@   r   r   rI   rz   before_space)r   r9   r   Zfirst_line_heightrG   countrz   r   r   r   parse_exact_line_spacing   s   

z"TextBlock.parse_exact_line_spacingc           	      C   s  t |}tt| jdd}tt| jdd}t||_t||_| j	dkr0tt| j
d|_nt| j
d|_| j}| jdk rD|| j8 }t||_t| j|_t| j|_| jtjkrtj|_| jD ]}|jt| j|  qbt| jtj d}t||_nB| jtjkrtj|_t|tj d}t||_n*| jtjkrtj|_t|tj d}t||_t| jtj d}t||_ntj|_| jD ]}| | q|S )a  Create paragraph for a text block.

        Refer to ``python-docx`` doc for details on text format:

        * https://python-docx.readthedocs.io/en/latest/user/text.html
        * https://python-docx.readthedocs.io/en/latest/api/enum/WdAlignParagraph.html#wdparagraphalignment
        
        Args:
            p (Paragraph): ``python-docx`` paragraph instance.

        .. note::
            The left position of paragraph is set by paragraph indent, rather than ``TAB`` stop.
        r   r|   r   r   )!r   Zreset_paragraph_formatr;   roundr}   Zafter_spacer   Zspace_beforeZspace_afterZline_space_typerz   Zline_spacingrl   first_line_spaceZleft_indentrj   Zright_indentZfirst_line_indentrg   r
   ri   r   Z	tab_stopsZadd_tab_stopr   r   ZITPr   rk   rm   JUSTIFYr   	make_docx)	r   ppfZbefore_spacingZafter_spacingrl   posdr$   r   r   r   r      sB   








zTextBlock.make_docxtext_direction_paramc                    sL  |\t | j |   dt | | j   d}t | d dtdt|d}t| |   | _|| _| j }|D ]$tdkrTqKfddt	dtD }	t
|	rotj  S qK fdd}
t|dkr|
 S fdd|D }fd	d|D }d
d t||D }t|dkr|dd |dd }}tt|t| |k}tt|t| |k}tt|t| k}|r|rt|dkrtjn|
 }n|rtj}n|rtj}n|rt|dkrtjntj}ntj}|tjks|tjkr$|d d j |d d j  | _|S )a  Detect text alignment mode based on layout of internal lines. It can't decide when only
        one line, in such case, the alignment mode is determined by externally check.
        
        Args:
            text_direction_param (tuple): ``(x0_index, x1_index, direction_factor)``, 
                e.g. ``(0, 2, 1)`` for horizontal text, while ``(3, 1, -1)`` for vertical text.
        r          @r|   c                    s4   g | ]}| j  |d   j     kqS )r   r   )r#   i)rp   rn   ro   r_   r=   r   r   r%     s    ,z.TextBlock._parse_alignment.<locals>.<listcomp>c                      s*   t k r	tjS d  krtjS tjS )Ng      ?)r7   r
   rm   ri   rk   r   )Wd_centerd_leftrd   r   r   external_alignment  s
   z6TextBlock._parse_alignment.<locals>.external_alignmentc                       g | ]	}|d  j   qS )r   r   r#   r   )rn   r   r   r%         c                    r   )r   r   )ro   r   r   r%     r   c                 S   s   g | ]
\}}|| d  qS )r   r   )r#   Zx0x1r   r   r   r%     s    re   Nr   r   )r   r   r;   r7   rl   rj   r   rA   rB   rangeanyr
   rh   zipminr   rm   ri   rk   r   )r   r   r   r_   rb   rc   rd   Zd_rightrE   disr   ZX0ZX1XZleft_alignedZright_alignedZcenter_alignedrg   r   )	r   r   r   rp   rn   ro   r_   rd   r=   r   rf   t  sN   





&&zTextBlock._parse_alignmentr/   )__name__
__module____qualname____doc__dictr   propertyr!   r,   r0   r5   rH   rI   rJ   rQ   rW   r\   floatrq   r{   r   r   rO   rf   __classcell__r   r   r   r   r   '   s\    





	
5&Tr   N)r   Zdocx.sharedr   r   Zdocx.enum.textr   r   Zimage.ImageSpanr   Zcommon.sharer	   r
   r   Zcommon.Blockr   r   commonr   r   r   r   r   r   r   <module>   s   