o
    e,                  
   @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZ dd Zdd	 Z	d
d Z
dd Zdd ZdddZdddddddddd	ZdddZG dd deZdS )zZ
This code is refer from: https://github.com/weizwx/html2docx/blob/master/htmldocx/h2d.py
    N)Document)BeautifulSoup)
HTMLParserc                 C   s   g d}| j d|ddS )Nz
table > trztable > thead > trztable > tbody > trztable > tfoot > trz, F	recursive)selectjoin)
table_souptable_row_selectors r   \D:\Projects\ConvertPro\env\Lib\site-packages\paddleocr/ppstructure/recovery/table_process.pyget_table_rows   s   r   c                 C   s   | r| j ddgddS g S )NthtdFr   )find_all)rowr   r   r   get_table_columns"   s   r   c                 C   sL   t | }|rt|d ng }d}|D ]}|jdd}|t|7 }q||fS )Nr   colspan   )r   r   attrsgetint)r
   rowscolsZ	col_countcolr   r   r   r   get_table_dimensions'   s   r   c                 C   s   d dd | jD S )N c                 S   s   g | ]}t |qS r   )str).0ir   r   r   
<listcomp>;   s    z!get_cell_html.<locals>.<listcomp>)r	   contents)soupr   r   r   get_cell_html7   s   r$   c                 C   s$   | j }| | d  |_|_ d S N)Z_elementZ	getparentremove_p)	paragraphpr   r   r   delete_paragraph>   s   r*   Fc                 C   s@   |r	t dd| } |rt dd| } t dd| } t dd| S )aP  Remove white space from a string.
    Args:
        string(str): The string to remove white space from.
        leading(bool, optional): Remove leading new lines when True.
        trailing(bool, optional): Remove trailing new lines when False.
    Returns:
        str: The input string with new line characters removed and white space squashed.
    Examples:
        Single or multiple new line characters are replaced with space.
            >>> remove_whitespace("abc\ndef")
            'abc def'
            >>> remove_whitespace("abc\n\n\ndef")
            'abc def'
        New line characters surrounded by white space are replaced with a single space.
            >>> remove_whitespace("abc \n \n \n def")
            'abc def'
            >>> remove_whitespace("abc  \n  \n  \n  def")
            'abc def'
        Leading and trailing new lines are replaced with a single space.
            >>> remove_whitespace("\nabc")
            ' abc'
            >>> remove_whitespace("  \n  abc")
            ' abc'
            >>> remove_whitespace("abc\n")
            'abc '
            >>> remove_whitespace("abc  \n  ")
            'abc '
        Use ``leading=True`` to remove leading new line characters, including any surrounding
        white space:
            >>> remove_whitespace("\nabc", leading=True)
            'abc'
            >>> remove_whitespace("  \n  abc", leading=True)
            'abc'
        Use ``trailing=True`` to remove trailing new line characters, including any surrounding
        white space:
            >>> remove_whitespace("abc  \n  ", trailing=True)
            'abc'
    z
^\s*\n+\s* z
\s*\n+\s*$z\s*\n\s*r   z\s+)resub)stringleadingZtrailingr   r   r   remove_whitespaceE   s   (r0   bolditalic	underlinestrikeZsuperscriptZ	subscript)	bstrongZemr    ussupr-   r   ZCourier)codeprec                       sh   e Zd Z fddZdddZdd Zdd	 Zd
d Zdd Zdd Z	dddZ
dd Zdd Z  ZS )
HtmlToDocxc                    s4   t    ddddd| _g d| _d | _d | _d S )NT)fix-htmlimagestablesstylesr   )super__init__optionsr   table_styleparagraph_styleself	__class__r   r   rB      s   


zHtmlToDocx.__init__Nc                 C   sn   g g d| _ |r|| _nt | _| jd | _| j| _d| _| jd | _| jd | _d | _	d| _
d | _d| _d S )N)spanlistr=   Tr>   r@   Fr   )tagsdocr   rC   bsdocumentinclude_tablesZinclude_imagesZinclude_stylesr(   skipZskip_tagZinstances_to_skip)rG   rO   r   r   r   set_initial_attrs   s"   
zHtmlToDocx.set_initial_attrsc                 C   s   |j | _ |j| _dS )z1Copy settings from another instance of HtmlToDocxN)rD   rE   )rG   otherr   r   r   copy_settings_from   s   zHtmlToDocx.copy_settings_fromc                 C   s<   g }d}|D ]}|r|d8 }q| | t|d}q|S )a  
        Returns array containing only the highest level tables
        Operates on the assumption that bs4 returns child elements immediately after
        the parent element in `find_all`. If this changes in the future, this method will need to be updated
        :return:
        r   r   table)appendlenr   )rG   Ztables_soupZ
new_tablesnestrU   r   r   r   ignore_nested_tables   s   
zHtmlToDocx.ignore_nested_tablesc                 C   s2   t | ds
d| _d S | | jd| _d| _d S )Nr#   FrU   r   )hasattrrP   rY   r#   r   r?   Ztable_norF   r   r   r   
get_tables   s
   

zHtmlToDocx.get_tablesc                 C   s<   | j rtrt|d| _t| j}| jr|   | | d S )Nhtml.parser)rN   r   r#   r   rP   r[   feed)rG   htmlr   r   r   run_process   s   

zHtmlToDocx.run_processc                 C   sj   t |tjjstdtjj |jd }|jdkrt| | | | 	| | j
js3| j
d d S d S )Nz Second argument needs to be a %sr   r+   )
isinstancedocxrU   _Cell
ValueErrorZ
paragraphstextr*   rR   r_   rM   add_paragraph)rG   r^   cellZunwanted_paragraphr   r   r   add_html_to_cell   s   



zHtmlToDocx.add_html_to_cellc              
   C   s\   z|r
|| j _W d S | jr| j| j _W d S W d S  ty- } z
td| j d|d }~ww )NzUnable to apply style .)r(   stylerE   KeyErrorrc   )rG   ri   er   r   r   apply_paragraph_style   s   z HtmlToDocx.apply_paragraph_stylec                 C   s2  t |d}t|\}}|t||}|jd |_d}t|D ]w\}}	t|	}
d}|
D ]f}t|j	
dd}t|j	
dd}t|}|jdkrLd| }|||}|jd	krf|d7 }|||}|jd	ksW||| d || d }||kr}|| t }||  ||pd
| ||7 }q+|d7 }qdS )a*  
        To handle nested tables, we will parse tables manually as follows:
        Get table soup
        Create docx table
        Iterate over soup and fill docx table with new instances of this parser
        Tell HTMLParser to ignore any tags until the corresponding closing table tag
        r\   z
Table Gridr   r   r   rowspanr   z	<b>%s</b>r+   r   N)r   r   Z	add_tablerW   r@   ri   	enumerater   r   r   r   r$   namerf   rd   merger<   rT   rg   )rG   r^   rM   r
   r   Zcols_lenrU   Zcell_rowindexr   r   Zcell_colr   r   rm   Z	cell_htmlZ	docx_cellZcell_to_mergeZchild_parserr   r   r   handle_table   s<   








zHtmlToDocx.handle_tablec           	      C   s   | j rd S d| jvrt|dd}| js| j | _|   | jd}|r/| |d | d S | j	|| _
| jd }|D ]}d|v rO| |d }| | q=| jD ]}|tv ret| }t| j
j|d |tv rrt| }|| j
j_qSd S )Nr;   TahrefrJ   ri   )rQ   rL   r0   r(   rM   re   rl   r   Zhandle_linkZadd_runrunZparse_dict_stringZadd_styles_to_runfont_stylessetattrZfont
font_namesro   )	rG   datalinkspansrJ   ri   tagZ
font_styleZ	font_namer   r   r   handle_data  s4   




zHtmlToDocx.handle_datar%   )__name__
__module____qualname__rB   rR   rT   rY   r[   r_   rg   rl   rr   r}   __classcell__r   r   rH   r   r<      s    


+r<   )FF)__doc__r,   ra   r   Zbs4r   html.parserr   r   r   r   r$   r*   r0   rv   rx   r<   r   r   r   r   <module>   s4   	
6