o
    eEL                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlZd dl	m
Z
 ddlmZ ddlmZ eeeejdg d	k rCed
ejejdd G dd dZG dd deZG dd deZdS )    N)perf_counter)Pool	cpu_count)Document   Page)Pages.)r      r   z)PyMuPDF>=1.19.0 is required for pdf2docx.z[%(levelname)s] %(message)s)levelformatc                	   @   sV  e Zd ZdZd9dedefddZedd Zed	d
 Zdd Z	edd Z
d:dededefddZd:dededefddZdd Zdd Zd9ddZdd Zdefd d!Zd"efd#d$Zd"efd%d&Zd;d'ed(ed)ed*efd+d,Zd<d(edededefd-d.Zd:dededefd/d0Zd(ededefd1d2Zed3d4 Zed5d6 Zed7d8 ZdS )=	Convertera  The ``PDF`` to ``docx`` converter.
    
    * Read PDF file with ``PyMuPDF`` to get raw layout data page by page, including text,
      image, drawing and its properties, e.g. boundary box, font, size, image width, height.
    * Analyze layout in document level, e.g. page header, footer and margin.
    * Parse page layout to docx structure, e.g. paragraph and its properties like indentaton, 
      spacing, text alignment; table and its properties like border, shading, merging. 
    * Finally, generate docx with ``python-docx``.
    Npdf_filepasswordc                 C   s,   || _ t|pd| _t|| _t | _dS )zInitialize fitz object with given pdf file path.

        Args:
            pdf_file (str): pdf file path.
            password (str): Password for encrypted pdf. Default to None if not encrypted.
         N)filename_pdfstrr   fitzr   	_fitz_docr	   _pages)selfr   r    r   BD:\Projects\ConvertPro\env\Lib\site-packages\pdf2docx/converter.py__init__!   s   zConverter.__init__c                 C      | j S N)r   r   r   r   r   fitz_doc1      zConverter.fitz_docc                 C   r   r   )r   r   r   r   r   pages4   r   zConverter.pagesc                 C   s   | j   d S r   )r   closer   r   r   r   r!   8   s    zConverter.closec                 C   s   i ddddddddddd	d
ddddddddddddddddddddddddddddd dddddddd!S )"zDefault parsing parameters.debugFZocrr   ignore_page_errorTmulti_processingr   Zmin_section_heightg      4@Zconnected_border_toleranceg      ?Zmax_border_widthg      @Zmin_border_clearanceg       @Zfloat_image_ignorable_gapg      @Zpage_margin_factor_topZpage_margin_factor_bottomZshape_min_dimensionZmax_line_spacing_ratiog      ?Zline_overlap_thresholdg?Zline_break_width_ratioZline_break_free_space_ratiog?g333333?g      ?g      @g      .@)Zline_separate_thresholdZnew_paragraph_free_space_ratioZlines_left_aligned_thresholdZlines_right_aligned_thresholdZlines_center_aligned_thresholdZclip_image_res_ratioZmin_svg_gap_dxZmin_svg_gap_dyZ	min_svg_wZ	min_svg_hZextract_stream_tableZparse_lattice_tableZparse_stream_tableZdelete_end_line_hyphenr   r   r   r   r   default_settings;   sd   	
zConverter.default_settingsr   startendr    c                 K   s&   |  |||jdi |jdi |S )a2  Parse pages in three steps:
        * open PDF file with ``PyMuPDF``
        * analyze whole document, e.g. page section, header/footer and margin
        * parse specified pages, e.g. paragraph, image and table

        Args:
            start (int, optional): First page to process. Defaults to 0, the first page.
            end (int, optional): Last page to process. Defaults to None, the last page.
            pages (list, optional): Range of page indexes to parse. Defaults to None.
            kwargs (dict, optional): Configuration parameters. 
        Nr   )
load_pagesparse_documentparse_pages)r   r&   r'   r    kwargsr   r   r   parsed   s   zConverter.parsec                 C   s   t | d | jjr#| jstd| j d| j| js#tdt	| j}| j
dd t|D  | ||||}|D ]}d| j
| _q?| S )a  Step 1 of converting process: open PDF file with ``PyMuPDF``, 
        especially for password encrypted file.
        
        Args:
            start (int, optional): First page to process. Defaults to 0, the first page.
            end (int, optional): Last page to process. Defaults to None, the last page.
            pages (list, optional): Range of page indexes to parse. Defaults to None.
        z[1/4] Opening document...zRequire password for r
   zIncorrect password.c                 S      g | ]}t |d dqS T)idskip_parsingr   .0ir   r   r   
<listcomp>       z(Converter.load_pages.<locals>.<listcomp>F)logginginfo_color_outputr   Z
needs_passr   ConversionExceptionr   Zauthenticatelenr   resetrange_page_indexesr0   )r   r&   r'   r    numpage_indexesr3   r   r   r   r(   u   s   	
zConverter.load_pagesc                 K   s*   t | d | jj| jfi | | S )zjStep 2 of converting process: analyze whole document, e.g. page section,
        header/footer and margin.z[2/4] Analyzing document...)r6   r7   r8   r   r,   r   )r   r+   r   r   r   r)      s   zConverter.parse_documentc                 K   s   t | d dd | jD }t|}t|ddD ]G\}}|jd }t d||| z
|jdi | W q tya } z |d sM|d rMt 	d	|| n
t
d
| d| W Y d}~qd}~ww | S )zKStep 3 of converting process: parse pages, e.g. paragraph, image and table.z[3/4] Parsing pages...c                 S   s   g | ]}|j s|qS r   )r0   r2   pager   r   r   r4      s    z)Converter.parse_pages.<locals>.<listcomp>r   r&   (%d/%d) Page %dr"   r#   z,Ignore page %d due to parsing page error: %szError when parsing page : Nr   )r6   r7   r8   r   r:   	enumerater/   r,   	Exceptionerrorr9   )r   r+   r    	num_pagesr3   rA   pider   r   r   r*      s    
zConverter.parse_pagesc                 K   s  t | d ttdd | j}|std|p&| jdtd   d}t	j
|r2t	| t }t|}t|dd	D ]H\}}|jsGq?|jd }	t d
|||	 z|| W q? ty }
 z |d ss|d rst d|	|
 n
td|	 d|
 W Y d}
~
q?d}
~
ww || dS )zStep 4 of converting process: create docx file with converted pages.
        
        Args:
            docx_filename (str): docx filename to write to.
            kwargs (dict, optional): Configuration parameters. 
        z[4/4] Creating pages...c                 S   r   r   )	finalized)rA   r   r   r   <lambda>   s    z%Converter.make_docx.<locals>.<lambda>z)No parsed pages. Please parse page first.r   z.pdfz.docxr   rB   rC   r"   r#   z+Ignore page %d due to making page error: %szError when make page rD   N)r6   r7   r8   listfilterr   r9   r   r:   ospathexistsremover   rE   rK   r/   	make_docxrF   rG   MakedocxExceptionsave)r   docx_filenamer+   Zparsed_pagesfilenameZ	docx_filerH   r3   rA   rI   rJ   r   r   r   rS      s0   

zConverter.make_docxc                 C   s(   t j| jt| jdd | jD dS )z"Store parsed pages in dict format.c                 S   s   g | ]	}|j r| qS r   )rK   storer@   r   r   r   r4      s    z#Converter.store.<locals>.<listcomp>)rW   page_cntr    )rO   rP   basenamer   r:   r   r   r   r   r   rX      s   zConverter.storedatac                 C   s^   | j s|dd}| j dd t|D  |dg D ]}|dd}| j | | qdS )	z"Restore pages from parsed results.rY   d   c                 S   r-   r.   r   r1   r   r   r   r4      r5   z%Converter.restore.<locals>.<listcomp>r    r/   N)r   getr;   r<   restore)r   r[   r>   Zraw_pageidxr   r   r   r_      s   zConverter.restorerW   c                 C   sL   t |ddd}|tj|  dd W d   dS 1 sw   Y  dS )z*Write parsed pages to specified JSON file.wzutf-8)encoding   )indentN)openwritejsondumpsrX   )r   rW   fr   r   r   	serialize   s   "zConverter.serializec                 C   sB   t |d}t|}W d   n1 sw   Y  | | dS )z+Load parsed pages from specified JSON file.rN)re   rg   loadr_   )r   rW   ri   r[   r   r   r   deserialize   s   zConverter.deserializer3   rV   	debug_pdflayout_filec                 K   sx   t j| j\}}|st j|d| }|st j|d}|dt |d | j|fd|gi| | 	| dS )a  Parse, create and plot single page for debug purpose.
        
        Args:
            i (int): Page index to convert.
            docx_filename (str): docx filename to write to.
            debug_pdf (str): New pdf file storing layout information. Default to add prefix ``debug_``.
            layout_file (str): New json file storing parsed layout data. Default to ``layout.json``.
        Zdebug_zlayout.jsonT)r"   Z	debug_docZdebug_filenamer    N)
rO   rP   splitr   joinupdater   r   convertrj   )r   r3   rV   rn   ro   r+   rP   rW   r   r   r   
debug_page  s   zConverter.debug_pagec                 K   s   t  }td| j | j}|| |r|d rtd|d r,| j|||fi | n| j|||fi |j	|fi | tdt  |  dS )a  Convert specified PDF pages to docx file.

        Args:
            docx_filename (str, optional): docx filename to write to. Defaults to None.
            start (int, optional): First page to process. Defaults to 0, the first page.
            end (int, optional): Last page to process. Defaults to None, the last page.
            pages (list, optional): Range of page indexes. Defaults to None.
            kwargs (dict, optional): Configuration parameters. Defaults to None.
        
        Refer to :py:meth:`~pdf2docx.converter.Converter.default_settings` for detail of 
        configuration parameters.
        
        .. note::
            Change extension from ``pdf`` to ``docx`` if ``docx_file`` is None.
        
        .. note::
            * ``start`` and ``end`` is counted from zero if ``--zero_based_index=True`` (by default).
            * Start from the first page if ``start`` is omitted.
            * End with the last page if ``end`` is omitted.
        
        .. note::
            ``pages`` has a higher priority than ``start`` and ``end``. ``start`` and ``end`` works only
            if ``pages`` is omitted.

        .. note::
            Multi-processing works only for continuous pages specified by ``start`` and ``end`` only.
        zStart to convert %sr$   zPMulti-processing works for continuous pages specified by "start" and "end" only.zTerminated in %.2fs.N)
r   r6   r7   r   r%   rr   r9   _convert_with_multi_processingr,   rS   )r   rV   r&   r'   r    r+   t0settingsr   r   r   rs     s   
$zConverter.convertc                 K   sV   | j }|| | j|||fi | g }| jD ]}|jr(||jdi | q|S )a  Extract table contents from specified PDF pages.

        Args:
            start (int, optional): First page to process. Defaults to 0, the first page.
            end (int, optional): Last page to process. Defaults to None, the last page.
            pages (list, optional): Range of page indexes. Defaults to None.
            kwargs (dict, optional): Configuration parameters. Defaults to None.
        
        Returns:
            list: A list of parsed table content.
        Nr   )r%   rr   r,   r   rK   extendextract_tables)r   r&   r'   r    r+   rw   ZtablesrA   r   r   r   ry   N  s   

zConverter.extract_tablesc           	         s   d rt d t nt  d fddt D }t }|j|d t D ]} d| d}tj|sAq0	| t
| q0j|fi  dS )	zParse and create pages based on page indexes with multi-processing.

        Reference:

            https://pymupdf.readthedocs.io/en/latest/faq.html#multiprocessing
        r   r    c                    s0   g | ]}| j j d | dfqS )-.json)r   r   r1   cpur'   r+   prefixr   r&   r   r   r4   q  s
    z<Converter._convert_with_multi_processing.<locals>.<listcomp>r   rz   r{   N)minr   r<   r   map_parse_pages_per_cpurO   rP   rQ   rm   rR   rS   )	r   rV   r&   r'   r+   Zvectorspoolr3   rW   r   r|   r   ru   g  s   
z(Converter._convert_with_multi_processingc                    s   | \}}}}}}}}t ||}	|	  |pt|	j}t|| t }
t|
| }|
| }|t||k  }|d | t|| d }t|| |
} fddt||D }|	jD ]}d|_qX|D ]}d|	j| _q`|	j	di |j
di || |	  dS )	a  Render a page range of a document.
        
        Args:
            vector (list): A list containing required parameters.
                * 0  : segment number for current process                
                * 1  : count of CPUs
                * 2,3: whole pages range to process
                * 4  : pdf filename
                * 5  : password for encrypted pdf
                * 6  : configuration parameters
                * 7  : json filename storing parsed results
        r   r   c                    s   g | ]} | qS r   r   r1   Zall_indexesr   r   r4         z2Converter._parse_pages_per_cpu.<locals>.<listcomp>TFNr   )r   r(   r:   r   r<   intr   r    r0   r)   r*   rj   r!   )Zvectorr`   r}   srJ   Zpdf_filenamer   r+   Zjson_filenameZcvrH   mnZseg_sizeZseg_fromZseg_tor?   rA   r3   r   r   r   r     s,   

zConverter._parse_pages_per_cpuc                 C   s@   |rdd |D }|S |p|}t t| t|}t|| }|S )zParsing arguments.c                 S   s   g | ]}t |qS r   )r   )r2   xr   r   r   r4     r   z+Converter._page_indexes.<locals>.<listcomp>)slicer   r<   )r&   r'   r    Zpdf_lenZindexesr   r   r   r   r=     s   zConverter._page_indexesc                 C   s   d|  dS )Nz[1;36mz[0mr   )msgr   r   r   r8     s   zConverter._color_outputr   )r   NN)NNN)Nr   NN)__name__
__module____qualname____doc__r   r   propertyr   r    r!   r%   r   rM   r,   r(   r)   r*   rS   rX   dictr_   rj   rm   rt   rs   ry   ru   staticmethodr   r=   r8   r   r   r   r   r      s:    



(	
+	/
.
r   c                   @      e Zd ZdS )r9   Nr   r   r   r   r   r   r   r9         r9   c                   @   r   )rT   Nr   r   r   r   r   rT     r   rT   )rO   rg   r6   timer   multiprocessingr   r   r   Zdocxr   Z	page.Pager   Z
page.Pagesr	   rM   r   r   ZVersionBindrp   
SystemExitbasicConfigINFOr   rF   r9   rT   r   r   r   r   <module>   s*      0