o
    eÀ=  ã                   @   sˆ   d Z ddlZddlmZmZ ddlmZ ddlmZ ddlm	Z	 dd	l
mZ dd
lmZ ddlmZmZ ddlmZ G dd„ de	ƒZdS )a—  Text Span object based on PDF raw dict extracted with ``PyMuPDF``.

Data structure for Span refer to 
this `link <https://pymupdf.readthedocs.io/en/latest/textpage.html>`_::

    {
        # raw dict
        ---------------------------
        'bbox': (x0,y0,x1,y1),
        'color': sRGB
        'font': fontname,
        'size': fontsize,
        'flags': fontflags,
        'chars': [ chars ],

        # added dict
        ----------------------------
        'text': text,
        'style': [
            {
                'type': int,
                'color': int,
                'uri': str    # for hyperlink
            },
            ...
        ]
    }
é    N)ÚPtÚRGBColor)Úqné   ©ÚCharé   )ÚElement)ÚRectType)Ú	constants)ÚshareÚdocx)ÚShapec                       sà   e Zd ZdZd)def‡ fdd„Zedd„ ƒZejdd„ ƒZd	d
„ Z	edd„ ƒZ
defdd„Zdefdd„Zdd„ Zdd„ Z‡ fdd„Zdef‡ fdd„Zd*dedefdd „Zd*dedefd!d"„Zd#d$„ Zd%d&„ Zd'd(„ Z‡  ZS )+ÚTextSpanzObject representing text span.NÚrawc                    sè   |pi }|  dd¡| _|  dd¡| _dd„ |  dg ¡D ƒ}dd„ |D ƒ| _|  dd	¡| _|  d
d	¡| _|  dd¡| _|  dd¡| _|  dd¡| _|  dd¡| _	|  dg ¡| _
|  dd¡| _tƒ  |¡ d| j ¡ v rr|  tj¡ d S d S )NÚcolorr   Úflagsc                 S   s   g | ]}t |ƒ‘qS © r   )Ú.0Úcr   r   úFD:\Projects\ConvertPro\env\Lib\site-packages\pdf2docx/text/TextSpan.pyÚ
<listcomp>3   s    z%TextSpan.__init__.<locals>.<listcomp>Úcharsc                 S   s   g | ]	}|j d kr|‘qS )Ú ©r   ©r   Úcharr   r   r   r   4   s    Útextr   ÚfontÚsizeg      (@Úascenderç      ð?Ú	descenderç        Úline_heightéÿÿÿÿÚstyleÚchar_spacingZUNNAMED)Úgetr   r   r   Ú_textr   r   r    r"   r$   r&   r'   ÚsuperÚ__init__ÚupperÚ_change_font_and_update_bboxr   ZDEFAULT_FONT_NAME)Úselfr   r   ©Ú	__class__r   r   r+   -   s"   ÿzTextSpan.__init__c                 C   s"   | j rd dd„ | j D ƒ¡S | jS )z:Get span text. Note joining chars is in a higher priority.r   c                 S   s   g | ]}|j ‘qS r   r   r   r   r   r   r   T   s    z!TextSpan.text.<locals>.<listcomp>)r   Újoinr)   ©r.   r   r   r   r   Q   s   "zTextSpan.textc                 C   s
   || _ dS )zLSet span text directly in case no chars are stores, e.g. restored from json.N)r)   )r.   Úvaluer   r   r   r   V   s   
c                 C   s"   t  ¡ }| jD ]}||jO }q|S )z,Calculate bbox based on contained instances.)ÚfitzZRectr   Úbbox)r.   r5   r   r   r   r   Úcal_bbox[   s   zTextSpan.cal_bboxc                 C   s
   | j dkS )Nr%   )r$   r2   r   r   r   Úis_valid_line_heighta   s   
zTextSpan.is_valid_line_heightÚ	font_namec                 C   sì   || _ t |¡}|j| j| jd}|| jjkr"|  j| jj| 9  _| j\}}}}t dd||f¡}|j	| j
d j| j|| jd\}	}
|	j| j d }|	j| }|	j| }|  ||||f¡ | j
D ]}|j\}}
}}
| ||||f¡ qadS )a  Set new font, and update font size, span/char bbox accordingly.

        It's generally used for span with unnamed fonts. 
        See this `issue <https://github.com/pymupdf/PyMuPDF/issues/642>`_.        

        In corner case, where the PDF file containing unnamed and not embedded fonts, the span bbox
        extracted from ``PyMuPDF`` is not correct. ``PyMuPDF`` provides feature to replace these 
        unnamed fonts with specified fonts, then extract correct bbox from the updated PDF. Since we 
        care less about the original PDF itself but its layout, the idea here is to set a default font 
        for text spans with unnamed fonts, and estimate the updated bbox with method from 
        ``fitz.TextWriter``.

        Args:
            font_name (str): Font name.
        )Úfontsizer   )r   r9   ç       @N)r   r4   ZFontÚtext_lengthr   r   r5   ÚwidthZ
TextWriterÚappendr   ÚoriginÚheightÚy0Úy1Úupdate_bbox)r.   r8   r   Ú
new_lengthÚx0r@   Úx1rA   ZtwÚrectÚ_Zbuffr   r   r   r   r-   e   s*   


ü



þz%TextSpan._change_font_and_update_bboxr   c                 C   s   | j  |¡ |  |¡ dS )z%Add char and update bbox accordingly.N)r   r=   Ú
union_bbox)r.   r   r   r   r   Úadd–   s   zTextSpan.addc                 C   sP   | j }| d¡s
dS t|ƒt| ¡ ƒ }| j|d d… | _| j|  ¡ d dS )z3Remove blanks at the left side, but keep one blank.ú  Fr   N©rF   T)r   Ú
startswithÚlenÚlstripr   rB   r6   ©r.   Zoriginal_textZ
num_blanksr   r   r   rN   œ   ó   zTextSpan.lstripc                 C   sP   | j }| d¡s
dS t|ƒt| ¡ ƒ }| jdd| … | _| j|  ¡ d dS )z4Remove blanks at the right side, but keep one blank.rJ   FNr   rK   T)r   ÚendswithrM   Úrstripr   rB   r6   rO   r   r   r   rR   ¨   rP   zTextSpan.rstripc                    s:   t ƒ  ¡ }| | j| j| j| j| j| j| j	| j
dœ¡ |S )N)r   r   r   r$   r   r   r&   r'   )r*   ÚstoreÚupdater   r   r   r$   r   r   r&   r'   )r.   Úresr/   r   r   rS   ´   s   
ø
zTextSpan.storer   c                    s   t ƒ j|||dd d S )Nr   )ÚstrokeÚfillr<   )r*   Úplot)r.   Úpager   r/   r   r   rX   Ã   s    zTextSpan.plotTrF   Ú
horizontalc                    sÄ  ˆj | j @ }|jr| gS g }ˆ r| j j|_| j j|_n
| j j|_| j j|_‡ ‡fdd„}tt|t| j	ƒƒƒ}|r>|d d nd}t
|ƒ}t|| dƒ}	|dkr€ˆ r_| j j| j j|j| j jf}
n| j j|j| j j| j jf}
|  ¡  |
¡}| j	d|… |_	| |¡ |dkr¨|j|j|j|jf}
|  ¡  |
¡}| j	||	… |_	| ˆˆ ¡ | |¡ |	t
| j	ƒk ràˆ r¿|j| j j| j j| j jf}
n| j j| j j| j j|jf}
|  ¡  |
¡}| j	|	d… |_	| |¡ |S )a  Split span with the intersection: span-intersection-span.

        Args:
            rect (Shape): Target shape to split this text span.
            horizontal (bool, optional): Text direction. Defaults to True.

        Returns:
            list: Split text spans.
        c                    s   | d   ˆˆ ¡S )Nr   )Zcontained_in_rect)Úitems©rZ   rF   r   r   Ú<lambda>ê   s    z TextSpan.split.<locals>.<lambda>r   r%   N)r5   Zis_emptyr@   rA   rD   rE   ÚlistÚfilterÚ	enumerater   rM   ÚmaxÚcopyrB   r=   Ú_parse_text_format)r.   rF   rZ   ZintsecZsplit_spansÚfZindex_charsÚposÚlengthZpos_endr5   Z
split_spanr   r\   r   ÚsplitÆ   sD   





zTextSpan.splitc                 C   s8  |  tj¡s|  tj¡rdS |  tj¡r#| j |j|j|j	dœ¡ dS |r'dnd}|j
|d  |j
|  }| j
|d  | j
|  }t| j
|d  |j
|  ƒ}|d| kri|jt d¡krh|  |tj¡rhtj|_n|d	| krttj|_nd
| |  k r‚d| k rˆn ntj|_|jsdS |j|jdœ}| j |¡ dS )a=  Parse text style based on the position to a rect shape.

        Args:
            rect (Shape): Target rect shape reprenting potential text style.
            horizontal (bool, optional): Horizontal text direction. Defaults to True.

        Returns:
            bool: Parsed text style successfully or not.
        F)Útyper   ÚuriTr   r   r   g      à?)r   r   r   g      Ð?gffffffÖ?g      è?)rh   r   )Zequal_to_typer
   ZBORDERZSHADINGÚ	HYPERLINKr&   r=   rh   r   ri   r5   Úabsr   Z	rgb_valueÚget_main_bboxr   ZFACTOR_MAJORÚ	HIGHLIGHTÚ	UNDERLINEÚSTRIKEZis_determined)r.   rF   rZ   ÚidxZh_rectZh_spanÚdr&   r   r   r   rc     s6   ý€
 
þzTextSpan._parse_text_formatc                 C   sv   |  | j¡r
|  ¡ S | | j¡stƒ S |  ¡ }|j ¡  | d¡ | jD ]}| |t	j
¡r8|j |¡ | |¡ q$|S )z‰Create new TextSpan object with chars contained in given bbox.
        
        Args:
            rect (fitz.Rect): Target bbox.
        )r#   r#   r#   r#   )Úcontainsr5   rb   Ú
intersectsr   r   ÚclearrB   rl   r   ZFACTOR_A_HALFr=   rH   )r.   rF   Úspanr   r   r   r   rs   P  s   



€zTextSpan.intersectsc                 C   sp   | j D ]}|d tjjkr| j ¡ rt ||d | j¡} nq| | j¡}|  	|¡ | j
r6t || j
¡ dS dS )a€  Add text span to a docx paragraph, and set text style, e.g. font, color, underline, hyperlink, etc.

        .. note::
            Hyperlink and its style is parsed separately from pdf. For instance, regarding a general hyperlink with an
            underline, the text and uri is parsed as hyperlink itself, while the underline is treated as a normal text
            style.
        rh   ri   N)r&   r
   rj   r3   r   Ústripr   Zadd_hyperlinkZadd_runÚ_set_text_formatr'   Zset_char_spacing)r.   Z	paragraphr&   Údocx_runr   r   r   Ú	make_docxk  s   
	€
ÿzTextSpan.make_docxc                 C   s.  t | jd@ ƒ|_t | jd@ ƒ|_t | jd@ ƒ|_| j}||j_|jjj	 
tdƒ|¡ tt | j¡Ž |jj_t| jd ƒd }t|ƒ|j_| j| }t|d ƒdkrWt ||¡ | jD ]:}|d }|tjjkrot ||d	 ¡ qZ|tjjkrŠ| j|d	 krd
|j_qZt ||d	 ¡ qZ|tjjkr”d
|j_ qZdS )z/Set text format for ``python-docx.run`` object.r   r   é   z
w:eastAsiar:   r!   g{®Gáz„?rh   r   TN)!Úboolr   ZsuperscriptÚitalicÚboldr   ÚnameZ_elementZrPrZrFontsÚsetr   r   r   Zrgb_componentr   ÚrgbÚroundr   r   rk   r   Zset_char_scalingr&   r
   rm   r3   Zset_char_shadingrn   Ú	underlineZset_char_underlinero   Ústrike)r.   rx   r8   Z	font_sizeÚscaler&   Útr   r   r   rw   ƒ  s0   


€îzTextSpan._set_text_format)N)T)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Údictr+   Úpropertyr   Úsetterr6   r7   Ústrr-   r   rI   rN   rR   rS   ÚtuplerX   r   r{   rg   rc   rs   ry   rw   Ú__classcell__r   r   r/   r   r   +   s*    $


1M=r   )r‰   r4   Zdocx.sharedr   r   Zdocx.oxml.nsr   r   Zcommon.Elementr	   Zcommon.sharer
   Úcommonr   r   r   Zshape.Shaper   r   r   r   r   r   Ú<module>   s   