o
    e                     @   sx   d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ dd	l	m
Z
 dd
lmZmZ ddlmZ G dd deZdS )z+
A wrapper of PyMuPDF Page as page engine.
    N   )RawPage   )ImagesExtractor)Paths)FACTOR_A_HALF)Element)RectType
debug_plot)get_areac                   @   sH   e Zd ZdZdd Zdd Zdd Zdd	 Zed
dd Z	dd Z
dS )RawPageFitzz6A wrapper of ``fitz.Page`` to extract source contents.c                 K   s   i }| j s|S | j j^ }}}|||d ||| _| _| jdi |}||d< | jdi |}|d | | jdi |\}}	||d< |d |	 | 	 }
|d |
 t
| j j |S )N)widthheightblocksshapes )page_enginerectupdater   r   _preprocess_text_preprocess_imagesextend_preprocess_shapes_preprocess_hyperlinksr   Zset_rotation_matrixZrotation_matrix)selfsettingsZraw_dict_whtext_blocksZimage_blocksr   Zimages
hyperlinksr   r   ID:\Projects\ConvertPro\env\Lib\site-packages\pdf2docx/page/RawPageFitz.pyextract_raw_dict   s    
zRawPageFitz.extract_raw_dictc                 K   s4  |d }|dkrt d| jjddd}|dg }z| j }W n ty1   td g }Y nw |s6|S |d	kr?d
d }ndd }tt	||}dd }g }	|D ]E}
d}|
d D ]5}|d D ]*}|D ]!}t
|d |d }|||d  tkr|d |d krd} nqd|r nq`|r nqZ|s|	|
 qR|	S )ak  Extract page text and identify hidden text. 
        
        NOTE: All the coordinates are relative to un-rotated page.

            https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
            https://pymupdf.readthedocs.io/en/latest/functions.html#Page.get_texttrace
            https://pymupdf.readthedocs.io/en/latest/textpage.html
        ocrr   z/OCR feature is planned but not implemented yet.Zrawdict@   )flagsr   zJIgnore hidden text checking due to UnicodeDecodeError in upstream library.r   c                 S   s   | d dkS Ntype   r   spanr   r   r!   <lambda>N       z.RawPageFitz._preprocess_text.<locals>.<lambda>c                 S   s   | d dkS r&   r   r)   r   r   r!   r+   P   r,   c                 S   s   | \}}}}|| ||  S )Nr   )bboxZx0Zy0x1y1r   r   r!   	span_areaS   s   z/RawPageFitz._preprocess_text.<locals>.span_areaFlinesspansr-   ZfontT)
SystemExitr   Zget_textgetZget_texttraceSystemErrorloggingwarninglistfilterr   r   append)r   r   r#   rawr   r2   fZfiltered_spansr0   r   blockZintersectedliner*   Zfilter_spanZintersected_arear   r   r!   r   1   sB   	



zRawPageFitz._preprocess_textc                 K   s$   |d dkrg S t | j|d S )aD  Extract image blocks. Image block extracted by ``page.get_text('rawdict')`` doesn't 
        contain alpha channel data, so it has to get page images by ``page.get_images()`` and 
        then recover them. Note that ``Page.get_images()`` contains each image only once, i.e., 
        ignore duplicated occurrences.
        r#   r   clip_image_res_ratio)r   r   Zextract_images)r   r   r   r   r!   r   m   s   zRawPageFitz._preprocess_imagesc                 K   s6   | j di |}||d |d |d |d |d S )zGIdentify iso-oriented paths and convert vector graphic paths to pixmap.Zmin_svg_gap_dxZmin_svg_gap_dyZ	min_svg_wZ	min_svg_hr?   Nr   )_init_pathsZto_shapes_and_images)r   r   pathsr   r   r!   r   y   s   zRawPageFitz._preprocess_shapeszSource Pathsc                 K   s   | j  }t| d|S )z:Initialize Paths based on drawings extracted with PyMuPDF.)parent)r   Zget_cdrawingsr   restore)r   r   Z	raw_pathsr   r   r!   r@      s   
zRawPageFitz._init_pathsc                 C   sH   g }| j  D ]}|d dkrq|tjjt|d |d d q|S )ziGet source hyperlink dicts.

        Returns:
            list: A list of source hyperlink dict.
        kindr   fromuri)r'   r-   rF   )r   Z	get_linksr:   r	   Z	HYPERLINKvaluetuple)r   r    linkr   r   r!   r      s   

z"RawPageFitz._preprocess_hyperlinksN)__name__
__module____qualname____doc__r"   r   r   r   r
   r@   r   r   r   r   r!   r      s    <
r   )rM   r6   r   Zimage.ImagesExtractorr   Zshape.Pathsr   Zcommon.constantsr   Zcommon.Elementr   Zcommon.sharer	   r
   Zcommon.algorithmr   r   r   r   r   r!   <module>   s   