o
    e4                     @   sR   d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
 G dd dZdS )	a  Extract images from PDF.

Both raster images and vector graphics are considered:

* Normal images like jpeg or png could be extracted with method ``page.get_text('rawdict')`` 
  and ``Page.get_images()``. Note the process for png images with alpha channel.
* Vector graphics are actually composed of a group of paths, represented by operators like
  ``re``, ``m``, ``l`` and ``c``. They're detected by finding the contours with ``opencv``.
    N   )
Collection)	BlockType)recursive_xy_cutinner_contoursxy_project_profilec                   @   s   e Zd ZdejddfddZd%dejdefd	d
Zd%dejdefddZ	d&defddZ
dedededefddZedejdejfddZedejdefddZedejfddZedejd efd!d"Zedejfd#d$ZdS )'ImagesExtractorpagereturnNc                 C   s
   || _ dS )zwExtract images from PDF page.
        
        Args:
            page (fitz.Page): pdf page to extract images.
        N)_page)selfr	    r   ND:\Projects\ConvertPro\env\Lib\site-packages\pdf2docx/image/ImagesExtractor.py__init__   s   
zImagesExtractor.__init__      @bboxzoomc                 C   s`   |  | j |du r| jj}n| jjr|| jj }n|}|| jj@ }t||}| jj||dS )a  Clip page pixmap (without text) according to ``bbox``.

        Args:
            bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page.
                Note that ``bbox`` depends on un-rotated page CS, while cliping page is based on
                the final page.
            zoom (float, optional): Improve resolution by this rate. Defaults to 3.0.

        Returns:
            fitz.Pixmap: The extracted pixmap.
        N)Zclipmatrix)_hide_page_textr   rectrotationZrotation_matrixfitzZMatrixZ
get_pixmap)r   r   r   	clip_bboxr   r   r   r   clip_page_to_pixmap   s   
z#ImagesExtractor.clip_page_to_pixmapclip_image_res_ratioc                 C   s   | j ||d}| ||S )ar  Clip page pixmap (without text) according to ``bbox`` and convert to source image.

        Args:
            bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page.
            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0.

        Returns:
            list: A list of image raw dict.
        )r   r   )r   _to_raw_dict)r   r   r   pixr   r   r   clip_page_to_dictA   s   
z!ImagesExtractor.clip_page_to_dictc                 C   s:  | j j}| j j}t }| j jddD ]-}t|}d|d< | j |}| j j}|D ]}| dkr1q(|	|s7q(|
||f q(qdd }	||	}
g }|
D ]M}t|dkrkt }|D ]\}}||O }q[| ||}n*|d \}}| ||}|j }|r| ||}n| ||}|r| || |d	< |
| qM|S )
a  Extract normal images with ``Page.get_images()``.

        Args:
            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0.

        Returns:
            list: A list of extracted and recovered image raw dict.
        
        .. note::
            ``Page.get_images()`` contains each image only once, which may less than the real count of images in a page.
        T)fullr      c                 S   s   | d  |d S )Nr   )
intersects)abr   r   r   <lambda>y   s    z0ImagesExtractor.extract_images.<locals>.<lambda>   image)r   parentr   r   Z
get_imageslistZget_image_rectsZcropboxZget_arear!   appendgrouplenr   Rectr   _recover_pixmap
colorspacer   _rotate_image)r   r   docr   ZicitemZrectsZunrotated_page_bboxr   ZfungroupsZimagesr*   r   Zraw_dictr   Z
alpha_onlyr   r   r   extract_imagesO   s>   
zImagesExtractor.extract_imagesmin_svg_gap_dxmin_svg_gap_dymin_wmin_hc              	      sR  ddl }| jdd}| |}|||j}||dd|j\}	 t ||d}
 fdd	|
D }tt	|
|}d
}|rt
|
D ])\}\}}}}t|||||ddf  ||||f }|d| | qB|D ]-\}}|\}}}}||||f||fdd |D ]\}}}}||||f||fdd qqn|d| |d |S )a  Find contour of potential vector graphics.

        Args:
            min_svg_gap_dx (float): Merge svg if the horizontal gap is less than this value.
            min_svg_gap_dy (float): Merge svg if the vertical gap is less than this value.
            min_w (float): Ignore contours if the bbox width is less than this value.
            min_h (float): Ignore contours if the bbox height is less than this value.

        Returns:
            list: A list of potential svg region: (external_bbox, inner_bboxes:list).
        r   N      ?)r         )Zmin_dxZmin_dyc                    s   g | ]	}t  |qS r   )r   ).0r   binaryr7   r6   r   r   
<listcomp>   s    z7ImagesExtractor.detect_svg_contours.<locals>.<listcomp>Fz
sub-image-)r:   r   r   r%   )r   r   r:   img)cv2r   _pixmap_to_cv_imageZcvtColorZCOLOR_BGR2GRAY	thresholdZTHRESH_BINARY_INVr   r(   zip	enumerater   ZimshowZ	rectangleZwaitKey)r   r4   r5   r6   r7   cvpixmapsrcgray_Zexternal_bboxesZgrouped_inner_bboxesr2   debugix0y0x1y1Zarrr   Zinner_bboxesZu0Zv0u1v1r   r<   r   detect_svg_contours   s,   
0
z#ImagesExtractor.detect_svg_contoursr&   c                 C   s    t jjt|| j| j|  dS )zStore Pixmap ``image`` to raw dict.

        Args:
            image (fitz.Pixmap): Pixmap to store.
            bbox (fitz.Rect): Boundary box the pixmap.

        Returns:
            dict: Raw dict of the pixmap.
        )typer   widthheightr&   )r   ZIMAGEvaluetuplerT   rU   tobytes)r&   r   r   r   r   r      s   zImagesExtractor._to_raw_dictrF   r   c                 C   s   ddl }ddl}t| }|jdd \}}|d |d }}d}	|||f||	}
||
d }||
d }t|| ||  }t|| ||  }|
d  |d | 7  < |
d  |d | 7  < |||
||f}|	d	|\}}|
 S )
zRotate image represented by image bytes.

        Args:
            pixmap (fitz.Pixmap): Image to rotate.
            rotation (int): Rotation angle.
        
        Return: image bytes.
        r   Nr   r8   )r   r   )r   r%   )r   r   )r%   r   z.png)r@   numpyr   rA   shapeZgetRotationMatrix2DabsintZ
warpAffineZimencoderX   )rF   r   rE   npr?   hwrL   rM   scaler   cossinWHZrotated_imgrI   Zim_pngr   r   r   r/      s    

zImagesExtractor._rotate_imagec                 C   sb   dd |   D }||   | j}|D ]}||dddddd}||| qd	S )
z$Hide page text before clipping page.c                 S   s   g | ]\}}}}|qS r   r   )r;   xrefnameZinvokerr   r   r   r   r>     s    z3ImagesExtractor._hide_page_text.<locals>.<listcomp>s   BTs   BT 3 Trs   Tms   Tm 3 Trs   Tds   Td 3 TrN)Zget_xobjectsextendZget_contentsr'   Zxref_streamreplaceZupdate_stream)r	   Z	xref_listr0   re   streamr   r   r   r     s   zImagesExtractor._hide_page_textr0   r1   c                 C   s   |d }|d }t | |}|dkr>t | |}|jr%t |d}d}|}|j|jkr8|j|jkr8t ||}ntd| |jrT|jjt j	jt j
jfvrTt t j
|}|S )a&  Restore pixmap with soft mask considered.
        
        References:

            * https://pymupdf.readthedocs.io/en/latest/document.html#Document.getPageImageList        
            * https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-handle-stencil-masks
            * https://github.com/pymupdf/PyMuPDF/issues/670

        Args:
            doc (fitz.Document): pdf document.
            item (list): image instance of ``page.get_images()``.

        Returns:
            fitz.Pixmap: Recovered pixmap with soft mask considered.
        r   r%   NzCIgnore image due to inconsistent size of color and mask pixmaps: %s)r   PixmapalpharT   rU   loggingwarningr.   rf   ZcsGRAYZcsRGB)r0   r1   xsr   masktempr   r   r   r-   +  s   zImagesExtractor._recover_pixmapc                 C   s0   ddl }ddl}|  }||||j|jS )znConvert fitz Pixmap to opencv image.

        Args:
            pixmap (fitz.Pixmap): PyMuPDF Pixmap.
        r   N)r@   rY   rX   ZimdecodeZ
frombufferZuint8ZIMREAD_COLOR)rF   rE   r]   Zimg_byter   r   r   rA   [  s   z#ImagesExtractor._pixmap_to_cv_image)Nr   )r   )__name__
__module____qualname__r   ZPager   r,   floatr   r   r3   rR   staticmethodrj   r   r\   r/   r   ZDocumentr(   r-   rA   r   r   r   r   r      s     	#N7-/r   )__doc__rl   r   Zcommon.Collectionr   Zcommon.sharer   Zcommon.algorithmr   r   r   r   r   r   r   r   <module>   s   
