o
    e&                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ G dd deeZdS )zA wrapper of pdf page engine (e.g. PyMuPDF, pdfminer) to do the following work:

* extract source contents
* clean up blocks/shapes, e.g. elements out of page
* calculate page margin
* parse page structure roughly, i.e. section and column
   )BasePage   )Layout)Section)Column)	Hyperlink)Shapes)Fonts)TextSpan)
debug_plot)	constants)
Collectionc                	       s   e Zd ZdZdddZdd Zedd Zed	d
 Ze	d fddZ
e	ddd ZdefddZdd Zdd ZededededefddZ  ZS )RawPagezA wrapper of page engine.Nc                 C   s   t |  t|  || _dS )zk Initialize page layout.
        
        Args:
            page_engine (Object): Source pdf page.
        N)r   __init__r   page_engine)selfr    r   ED:\Projects\ConvertPro\env\Lib\site-packages\pdf2docx/page/RawPage.pyr      s   


zRawPage.__init__c                 K   s   t )a*  Extract source data with page engine. Return a dict with the following structure:
        ```
            {
                "width" : w,
                "height": h,    
                "blocks": [{...}, {...}, ...],
                "shapes" : [{...}, {...}, ...]
            }
        ```
        )NotImplementedErrorr   settingsr   r   r   extract_raw_dict&   s   zRawPage.extract_raw_dictc                 C      d dd | jD S )zxAll extracted text in this page, with images considered as ``<image>``. 
        Should be run after ``restore()`` data.
c                 S      g | ]}|j qS r   )text.0blockr   r   r   
<listcomp>7       z RawPage.text.<locals>.<listcomp>joinblocksr   r   r   r   r   3   s   zRawPage.textc                 C   r   )zKExtracted raw text in current page. Should be run after ``restore()`` data.r   c                 S   r   r   )raw_textr   r   r   r   r   <   r    z$RawPage.raw_text.<locals>.<listcomp>r!   r$   r   r   r   r%   9   s   zRawPage.raw_textzSource Text Blocksc                    s"   | j di |}t | | jS )z-Initialize layout extracted with ``PyMuPDF``.Nr   )r   superrestorer#   )r   r   Zraw_dict	__class__r   r   r'   ?   s   zRawPage.restorezCleaned Shapesc                 K   s2   | j |d |d  | j|d |d  | jS )zClean up raw blocks and shapes, e.g. 
        
        * remove negative or duplicated instances,
        * detect semantic type of shapes
        Zfloat_image_ignorable_gapZline_overlap_thresholdZmax_border_widthZshape_min_dimension)r#   clean_upshapesr   r   r   r   r*   G   s   zRawPage.clean_upfontsc                 C   sb   g }| j D ]}|dd |jD  q|D ]}||j}|s q|j|_|jr.|j|j |_qdS )zUpdate font properties, e.g. font name, font line height ratio, of ``TextSpan``.
        
        Args:
            fonts (Fonts): Fonts parsed by ``fonttools``.
        c                 S   s   g | ]	}t |tr|qS r   )
isinstancer
   )r   spanr   r   r   r   d       z(RawPage.process_font.<locals>.<listcomp>N)r#   extendspansgetfontnameline_heightsize)r   r,   r1   liner.   r3   r   r   r   process_font[   s   
zRawPage.process_fontc                 K   s   t dd | jD }| js|stjfd S | j\}}}}| jj|jB \}}}	}
t|| d}t||	 tj d}t|| d}t||
 d}||d 9 }||d 9 }ttjt	|dttjt	|dttjt	|dttjt	|dfS )zCalculate page margin.

        .. note::
            Ensure this method is run right after cleaning up the layout, so the page margin is 
            calculated based on valid layout, and stay constant.
        c                 S   s   g | ]	}t |ts|qS r   )r-   r   )r   shaper   r   r   r   z   r/   z,RawPage.calculate_margin.<locals>.<listcomp>   g        Zpage_margin_factor_topZpage_margin_factor_bottomr   )
r   r+   r#   r   ZITPbboxmaxZ
MINOR_DISTminround)r   r   r+   x0y0x1y1u0v0u1v1leftrighttopbottomr   r   r   calculate_marginq   s   	zRawPage.calculate_marginc                    s  j \ }}t }|j |j |sdS t }g  fdd}d}|}| D ]}	|	 }
t|
}|dkrAd}nN|dkr|
d j\}}}}|
d j\}}}}|| d }|  | }}|| || }}d}d| ||   kr|krn n|| dk s|| dk rd}|dkr|dkr| }
|
d jd }|	jd |ks|	jd |krd}n/|j\}}}}|| |d	 k rd}n|dkr|dkrt|}||	 t|jd
ddkrd}||kr|||| rd d jd }t|	}|}q0||	 q0|||| S )zDetect and create page sections.

        .. note::
            - Only two-columns Sections are considered for now.
            - Page margin must be parsed before this step.
        Nc                    sp   r#d j |   krdkr#n nd d }|| || d S | | f|}|r6| d S d S )Nr       )Znum_colsZ
union_bboxadd_elements_create_sectionappend)num_colelementsy_refcolumnsectionX0X1sectionsr   r   r   close_section   s   "
z,RawPage.parse_section.<locals>.close_sectionr   r   rM          @gQ?Zmin_section_heightF)sortedrL      )	Zworking_bboxr   r0   r#   r+   Zgroup_by_rowsgroup_by_columnslenr;   )r   r   ZY0_rR   linesrZ   Zpre_num_colrS   rowcolsZcurrent_num_colrC   rD   rE   rF   m0n0m1n1r?   c1c2Zw1Zw2fposr@   rA   rB   combiner   rV   r   parse_section   s\   8
zRawPage.parse_sectionrQ   rR   h_rangerS   c                 C   s   |sdS |\}}| dkr.|j \}}}}	t ||||	f}
|
| td|
gd}|| }nF| }|d j \}}}}|d j \}}}}|| d }t ||||f}|| t ||||f}|| td||gd}|| }t|d|_|S )zQCreate section based on column count, candidate elements and horizontal boundary.Nr   rM   )spacecolumnsr[   )r;   r   Zupdate_bboxrN   r   r^   r>   before_space)rQ   rR   rn   rS   rW   rX   r?   r@   rA   rB   rT   rU   rq   rc   rC   rD   rE   rF   rd   re   rf   rg   uZcolumn_1Zcolumn_2r   r   r   rO      s(   



zRawPage._create_section)N)__name__
__module____qualname____doc__r   r   propertyr   r%   r   r'   r*   r	   r8   rK   rm   staticmethodintr   tuplefloatrO   __classcell__r   r   r(   r   r      s"    



#e$r   N)rv   r   Zlayout.Layoutr   Zlayout.Sectionr   Zlayout.Columnr   Zshape.Shaper   Zshape.Shapesr   Z
font.Fontsr	   Ztext.TextSpanr
   Zcommon.sharer   commonr   Zcommon.Collectionr   r   r   r   r   r   <module>   s   