a
    d                     @   s   d dl Z d dlZd dlmZ d dlZd dlZd dlmZmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZ dd	lmZ d
d Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zed kreed! dS )"    N)Path)	PdfReader	PdfMerger)Output)	Converter)PdfPdfImage)PPStructure   )HocrTransformc                 C   s    t | } |  dd} t| S )N z\ )strstripreplacer   )	file_name r   y/home/svplap0089/Collabll/API Projects/DocumentConvertor/convertor/docconvertor/fileconverter/utils/document_converter.pyfix_file_name   s    r   c                 C   sF   t d|  t| }t| j| |  }|| |  |dfS )NzThe input file name is T)printr   r   parentconvertclose)	file_pathr   format	converteroutput_file_pathr   r   r   convert_pdf_to_docx   s    
r   c                 C   s<   t j| dddd}|jdkr8t | |  }|dfS dS )NT)shellcapture_outputtextr   )failF)
subprocessrun
returncoder   cwd)shell_commandr   output_formatZprocess_outputr   r   r   r   run_shell_command   s    
r'   c                 C   sZ   | d}t t|j| d}||  W d    n1 s>0    Y  t|j| dfS )Nz.txtwT)openr   r   write)txt_datainput_file_pathinput_file_namer   Ztxt_fpr   r   r   create_txt_file)   s    
(r.   c                 C   s(   d}t | }|jD ]}|| 7 }q|S )N )r   pagesextract_text)r   r+   
pdf_readerpager   r   r   extract_text_data0   s
    
r4   c                 C   s   t d t| }g }|jD ]}|jddd}t|j d }|j| }t|d}| | d}	t	|}
|

 }|dkr|j| dd	}||	 ||	 q|S )
NzExtracting images from pdfz/Rotate1   )defaultr   z/Name.pngT)expand)r   r   r)   r0   getlistimageskeysr   r   Zas_pil_imagerotatesaveappend)Zpdf_file_pathtmp_dirpdf_fileZimage_locationsr3   rotationZ	image_keyZ	raw_imageZ
image_nameZimage_locationZ	pdf_imageZ	pil_imager   r   r   extract_images_from_pdf8   s     



rC   c           
   	   C   s   t d g }| D ]}t|}tt|}tj|dd}||t|}|j|j	 d }t
|d(}	|	| |t| W d    q1 s0    Y  q|S )NzCreating HOCR fileshocr)	extensionz.hocrr(   )r   r   cv2imreadr   pytesseractZimage_to_pdf_or_hocrZget_updated_hocr_datar   stemr)   r*   r?   )
img_locationsfont_detectorhocr_file_locationsimg_locationimg_location_pathZimg_dataocr_dataZupdate_ocr_dataZhocr_file_locationZhocr_fpr   r   r   create_hocr_fileP   s    
.rP   c              	   C   s  t d g }| D ]}t|}tt|}|t|jd }tj|t	j
d}t|d }t|D ]|}|d | |d | |d | |d | f\}	}
}}|d | d	krb|d
 |  rbt||	|
f|	| |
| fdtj qbtt|| |t| q|S )NzRemoving text from imagesZ_ocr)output_typelevellefttopwidthheight   r      rY   rY   )r   r   rF   rG   r   	with_stemrI   rH   Zimage_to_datar   DICTlenranger   	rectangleFILLEDimwriter?   )rJ   Zbackground_imgsrM   rN   imgZnew_path_namerO   Zn_boxesixyr(   hr   r   r   create_background_imagesc   s&    



 &rf   c           	      C   s`   t d g }t| ||D ]B\}}}t|d}t|dd}|j|d dd|d || q|S )NzConverting HOCR to pdfz.pdfg     b@)Zhocr_filenamedpiFT)Zout_filenameZbackground_image_filenameZshow_bounding_boxesZinterword_spacesfigure_details)r   zipr   with_suffixr   Zto_pdfr?   )	rL   Z background_removed_img_locationsrh   Zcombined_pdf_file_locationsZ	hocr_fileZbackground_removed_imgZfigure_detailZoutput_file_locationrD   r   r   r   combine_text_image~   s     rk   c              	   C   s0  t d tdddd}g }g }| D ]}t|}g }t|}||}t|D ]\}	}
|
d dkrN|
d \}}}}t }||	 d}|||||f }tt	|| |
d |d< t	||d	< t
|||f||fd
tj || qN||j d}tt	|| || |t	| q"||fS )NzExtracting figures from imageTF)figureocrZshow_logtyperl   bboxr7   namerX   Z_bl)r   r	   r   rF   rG   	enumeratedict	with_namer`   r   r^   r_   r?   rZ   rI   )rJ   Zlayout_analysis_modelZfigure_location_listZnew_img_locationsrM   Ztmp_img_locationZtmp_figure_location_listZtmp_imgZ
tmp_resultindexlinerS   rT   rU   rV   Ztmp_dictZtmp_figure_nameZ
tmp_figurenew_file_namer   r   r   extract_figure_from_img   s6    


rw   c                 C   s8   t d t }| D ]}|| q|| |  |S )NzMerging pdf files)r   r   r?   r*   r   )pdf_file_locationsZinput_filepathZ
pdf_mergerpdfr   r   r   merge_pdf_files   s    
rz   __main__zReport Final.docx) shutilr!   pathlibr   rF   rH   PyPDF2r   r   r   pdf2docxr   Zpikepdfr   r   Z	paddleocrr	   hocr_to_pdfr   r   r   r'   r.   r4   rC   rP   rf   rk   rw   rz   __name__r   r   r   r   r   <module>   s.   	
"
