o
    el&                     @   sh   d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ dd	lm	Z	 dd
l
mZ G dd deZdS )zA group of Line objects.
    N   )Line)TextSpan   )	ImageSpan)ElementCollection)TextAlignment)	constantsc                   @   s~   e Zd ZdZedd ZdefddZedd Zd	e	d
e	fddZ
defddZdd Zde	d	e	fddZde	fddZdS )LineszCollection of text lines.c                    s.   t | sdS | jd  t fdd| jD S )z-Whether all contained lines have same parant.Fr   c                 3   s    | ]}|  V  qd S N)Zsame_source_parent).0line
first_line CD:\Projects\ConvertPro\env\Lib\site-packages\pdf2docx/text/Lines.py	<genexpr>   s    z&Lines.unique_parent.<locals>.<genexpr>)bool
_instancesall)selfr   r   r   unique_parent   s   
zLines.unique_parentrawsc                 C   s    |D ]}t |}| | q| S )z$Construct lines from raw dicts list.)r   append)r   r   rawr   r   r   r   restore   s   zLines.restorec                 C   s    g }| j D ]}||j q|S )zGet all ImageSpan instances.)r   extendimage_spans)r   spansr   r   r   r   r   $   s   
zLines.image_spansline_break_free_space_rationew_paragraph_free_space_ratioc                 C   sD  |   }t|}|dkr|S tdd |dd D }tdd |D | }g }t }ttj}	d }
}d }}|D ]^}|d j	 
|	}|d jd |d	 jd	  }|ra|| d
| krad}n|rm|| | |krmd}
|r}|| || t }n|
r|| t }|| n|| |}d }
}q:|r|| |S )a  Split lines into separate paragraph by checking text. The parent text block consists of 
        lines with similar line spacing, while lines in other paragraph might be counted when the
        paragraph spacing is relatively small. So, it's necessary to split those lines by checking
        the text contents.

        .. note::
            Considered only normal reading direction, from left to right, from top
            to bottom.
        r   c                 s   s,    | ]}|d  j d |d j d  V  qdS )r   r   Nbboxr   rowr   r   r   r   >      * z1Lines.split_vertically_by_text.<locals>.<genexpr>Nc                 s   s,    | ]}|d  j d |d  j d  V  qdS )r      r   Nr"   r$   r   r   r   r   ?   r&   Fr!   r   r   g      ?T)group_by_physical_rowslenmaxsumr
   tupler	   ZSENTENSE_END_PUNCtextstripendswithr#   r   r   )r   r   r    rowsnumWHreslinesZpuncZstart_of_paraZend_of_paraZstart_of_senZ
end_of_senr%   wr   r   r   split_vertically_by_text-   s<   






zLines.split_vertically_by_textdelete_end_line_hyphenc                    s   d dd tjD   fdd}t| jdd D ]T\}}|jd }t|ts)q|j}|s/q|d }| j|d  jd	 }t|tsCq|j}	|	sIq|	d	 }
|r]|j	
d
r]|
j	 r]d|_	||j	rn||
j	rn| j	d7  _	qdS )zAdjust word at the end of line:
        # - it might miss blank between words from adjacent lines
        # - it's optional to delete hyphen since it might not at the the end 
           of line after conversion
         c                 s   s    | ]	}|d kr|V  qdS )-Nr   )r   cr   r   r   r   o   s    z)Lines.adjust_last_word.<locals>.<genexpr>c                    s   |   p	| o	|  v S r   )isalnum)r;   Zpunc_ex_hyphenr   r   is_end_of_english_wordp   s   z6Lines.adjust_last_word.<locals>.is_end_of_english_wordNr!   r   r   r:    )joinstringpunctuation	enumerater   r   
isinstancer   charsr;   r/   islower)r   r8   r>   ir   Zend_spanZ	end_charsZend_charZ
start_spanZstart_charsZnext_start_charr   r=   r   adjust_last_wordi   s.   

zLines.adjust_last_wordc                 C   s   d}| j D ]A}|tj}|j|s|jj|jjk r |S qg }|jD ]}t	|t
r1|| q$|||j}|| d}q$|j| q|S )zParse text format with style represented by rectangle shape.
        
        Args:
            shape (Shape): Potential style shape applied on blocks.
        
        Returns:
            bool: Whether a valid text style.
        FT)r   Zget_expand_bboxr	   Z
MAJOR_DISTr#   Z
intersectsy1Zy0r   rD   r   r   splitis_horizontal_textr   reset)r   shapeflagr   Zexpanded_bboxZsplit_spansspanr   r   r   r   parse_text_format   s   	


zLines.parse_text_formatline_break_width_ratioc                 C   s   | j }|jrdnd\}}t|j| |j|  }|| ||  }|| |k}	|  }
|
D ]F}|D ]}d|_q0|jtjkrM|d }t|j| |j|  }n|d }t|j| |j|  }|jtj	krg|d9 }|	so|| |krrd|_q,|
d D ]}d|_qwdS )a  Whether hard break each line. 

        Args:
            bbox (Rect): bbox of parent layout, e.g. page or cell.
            line_break_width_ratio (float): user defined threshold, break line if smaller than this value.
            line_break_free_space_ratio (float): user defined threshold, break line if exceeds this value.

        Hard line break helps ensure paragraph structure, but pdf-based layout calculation may
        change in docx due to different rendering mechanism like font, spacing. For instance, when
        one paragraph row can't accommodate a Line, the hard break leads to an unnecessary empty row.
        Since we can't 100% ensure a same structure, it's better to focus on the content - add line
        break only when it's necessary to, e.g. short lines.
        r   r   r'   r   r   r!   r   r   N)
parentrK   absr#   r(   
line_break	alignmentr   RIGHTZCENTER)r   r#   rQ   r   blockidx0idx1Zblock_widthZlayout_widthrV   r0   r5   r   end_lineZ
free_spacer   r   r   parse_line_break   s$   zLines.parse_line_breakline_separate_thresholdc           	         s   | j   jrdnd\} fdd}tt|| j}ttdd | _ js*dS  j }t	| jD ]0\}}|j | }||krFd|_
|| jd krP dS || j|d  r_|j| n j }q4dS )	zCalculate tab stops for parent block and whether add TAB stop before each line. 

        Args:
            line_separate_threshold (float): Don't need a tab stop if the line gap less than this value.
        rR   rS   c                    s   t t| j  j  dS )Nr   )roundrU   r#   )r   rY   rZ   r   r   <lambda>   s    z&Lines.parse_tab_stop.<locals>.<lambda>c                 S   s
   | t jkS r   )r	   Z
MINOR_DIST)posr   r   r   ra      s   
 Nr   r!   )rT   rK   setmapr   listfilterZ	tab_stopsr#   rC   Ztab_stopZin_same_row)	r   r^   r[   ZfunZall_posrefrG   r   Zdistancer   r`   r   parse_tab_stop   s   

*zLines.parse_tab_stopN)__name__
__module____qualname____doc__propertyr   re   r   r   floatr7   r   rH   rP   r]   rh   r   r   r   r   r
      s     

<%$
2r
   )rl   rA   r   r   Zimage.ImageSpanr   Zcommon.Collectionr   Zcommon.sharer   commonr	   r
   r   r   r   r   <module>   s   