o
    e[                     @   s:   d Z ddlmZ ddlmZ ddlmZ G dd dZdS )aD  Document layout depends on Blocks and Shapes.

**Layout** here refers to the content and position of text, image and table. The target is to convert
source blocks and shapes to a *flow layout* that can be re-created as docx elements like paragraph and
table. In addition to ``Section`` and ``Column``, ``TableBlock`` is used to maintain the page layout . 
So, detecting and parsing table block is the principle steps.

The prerequite work is done before this step:

1. Clean up source blocks and shapes in Page level, e.g. convert source blocks to ``Line`` level,
   because the block structure determined by ``PyMuPDF`` might be not reasonable.
#. Parse structure in document level, e.g. page header/footer.
#. Parse Section and Column layout in Page level. 

The page layout parsing idea:

1. Parse table layout in Column level.
    (a) Detect explicit tables first based on shapes. 
    (#) Then, detect stream tables based on original text blocks and parsed explicit tables.
    (#) Move table contained blocks (lines or explicit table) to associated cell-layout.
#. Parse paragraph in Column level.
    (a) Detect text blocks by combining related lines.
    (#) Parse paragraph style, e.g. text format, alignment
#. Calculate vertical spacing based on parsed tables and paragraphs.
#. Repeat above steps for cell-layout in parsed table level.
   )Line)	constants)Shapesc                   @   s|   e Zd ZdZdddZdd Zdd Zd	d
 ZdefddZ	de
fddZde
fddZdd Zdd Zdd Zdd ZdS )Layoutz(Blocks and shapes structure and formats.Nc                 C   sD   ddl m } ddlm} ||| d| _t|| d| _|| d| _dS )a5   Initialize layout.

        Args:
            blocks (Blocks): Blocks representing text/table contents.
            shapes (Shapes): Shapes representing table border, shading and text style like underline, highlight.
            parent (Page, Column, Cell): The object that this layout belonging to.
           )Blocksr   )TablesConstructor)Z	instancesparent)r	   N)r   Ztable.TablesConstructorr   blocksr   shapes_table_parser)selfr
   r   r   r    r   FD:\Projects\ConvertPro\env\Lib\site-packages\pdf2docx/layout/Layout.py__init__&   s
   zLayout.__init__c                 O      t )zWorking bbox of current Layout.NotImplementedErrorr   argskwargsr   r   r   working_bbox6      zLayout.working_bboxc                 O   r   )z2Whether given element is contained in this layout.r   r   r   r   r   contains;   r   zLayout.containsc                 C   s   | j  | j dS )z#Store parsed layout in dict format.)r
   r   )r
   storer   )r   r   r   r   r   @   s   zLayout.storedatac                 C   s,   | j |dg  | j|dg  | S )z#Restore Layout from parsed results.r
   r   )r
   restoregetr   )r   r   r   r   r   r   H   s   zLayout.restorer
   c                 C   s   |D ]}|  | qdS )a  Add blocks (line or table block) to this layout. 
        
        Args:
            blocks (list): a list of text line or table block to add.
        
        .. note::
            If a text line is partly contained, it must deep into span -> char.
        N)_assign_block)r   r
   blockr   r   r   assign_blocksO   s   	zLayout.assign_blocksr   c                 C   s(   |D ]}| j |jr| j| qdS )zuAdd shapes to this cell. 
        
        Args:
            shapes (list): a list of Shape instance to add.
        N)r   
intersectsbboxr   append)r   r   shaper   r   r   assign_shapes[   s   zLayout.assign_shapesc                 K   sT   | j sdS | jdi | | jdi | tdd | j D ]
}|jdi | qdS )z]Parse layout.

        Args:
            settings (dict): Layout parsing parameters.
        Nc                 S   s   | j S )N)Zis_table_block)er   r   r   <lambda>u   s    zLayout.parse.<locals>.<lambda>r   )r
   _parse_table_parse_paragraphfilterparse)r   settingsr   r   r   r   r+   f   s   
zLayout.parsec                 C   sV   | j |tjdr| j| dS | j|j@ r't|tr)| j|| j dS dS dS )z/Add block (line or table block) to this layout.)	thresholdN)	r   r   ZFACTOR_MAJORr
   r#   r"   
isinstancer   r!   )r   r   r   r   r   r   y   s
   zLayout._assign_blockc                 K   sP   |d r| j |d |d |d  |d r&| j |d |d |d  dS dS )a  Parse table layout: 
        
        * detect explicit tables first based on shapes, 
        * then stream tables based on original text blocks and parsed explicit tables;
        * move table contained blocks (text block or explicit table) to associated cell layout.
        Zparse_lattice_tableZconnected_border_toleranceZmin_border_clearanceZmax_border_widthZparse_stream_tableline_separate_thresholdN)r   Zlattice_tablesZstream_tablesr   r,   r   r   r   r(      s   zLayout._parse_tablec              	   K   sd   | j |d |d |d  | j | jj|d  | j |d |d |d |d |d |d	  d
S )zmCreate text block based on lines, and parse text format, e.g. text highlight, 
        paragraph indentation Zmax_line_spacing_ratioZline_break_free_space_ratioZnew_paragraph_free_space_ratioZdelete_end_line_hyphenr/   Zline_break_width_ratioZlines_left_aligned_thresholdZlines_right_aligned_thresholdZlines_center_aligned_thresholdN)r
   Zparse_blockZparse_text_formatr   Ztext_style_shapesZparse_spacingr0   r   r   r   r)      s"   zLayout._parse_paragraph)NN)__name__
__module____qualname____doc__r   r   r   r   dictr   listr    r%   r+   r   r(   r)   r   r   r   r   r   #   s    
r   N)r4   Z	text.Liner   commonr   Zshape.Shapesr   r   r   r   r   r   <module>   s
   