o
    eH9                     @   sT  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl m"Z" d dl m#Z# d dl$m%Z% dZ&edduZ'e'rd dl(m)Z) edduZ*e*rd dl+Z,dZ-e .dZ/dZ0h dZ1de2fde2fd e2fd!e3fd"e4fd#e3fd$Z5e#d%Z6e#d&Z7G d'd( d(Z8G d)d* d*e9Z:G d+d, d,e;Z<G d-d. d.e9Z=G d/d0 d0e9Z>G d1d2 d2e9Z?d3d4 Z@edgd5d6ZAd7d8 ZBd9d: ZCd;d< ZDd=d> ZEed?d@ ZFdhdBdCZG	D	 	 didEdFZH	D		D	 	 	GdjdHdIZIdJdK ZJdLdM ZKdNdO ZLeBdkdPdQZMeBdRdS ZNddDd e8jOd fdTdUZP		D	 	V	 dldWdXZQ		D	 	 dmdYdZZRddDd e8jOd fd[d\ZSdgd]d^ZTddDd e8jOd dfd_d`ZUdadDd e8jOd fdbdcZVddde ZWeXdfkreYeW  dS dS )n    N)contextmanager)
QUOTE_NONE)ENOENT)wraps)iglob)BytesIO)environ)extsep)linesep)remove)normcase)normpath)realpath)find_loader)NamedTemporaryFile)sleep)InvalidVersion)parse)Version)ImageZ	tesseractnumpy)ndarraypandaszutf-8z	^[a-z_]+$RGB>
   ZGIFZTIFFZWEBPZJPEG2000ZPPMZPGMZPBMBMPZJPEGPNGZpage_numZorientationrotateZorientation_confscriptZscript_conf)zPage numberzOrientation in degreesZRotatezOrientation confidenceZScriptzScript confidencez3.05z4.1.0c                   @   s   e Zd ZdZdZdZdZdS )Outputbytesz
data.framedictstringN)__name__
__module____qualname__BYTES	DATAFRAMEDICTSTRING r)   r)   GD:\Projects\ConvertPro\env\Lib\site-packages\pytesseract/pytesseract.pyr   E   s
    r   c                          e Zd Z fddZ  ZS )PandasNotSupportedc                       t  d d S )NzMissing pandas packagesuper__init__self	__class__r)   r*   r0   M   s   zPandasNotSupported.__init__r"   r#   r$   r0   __classcell__r)   r)   r3   r*   r,   L       r,   c                   @   s   e Zd Zdd ZdS )TesseractErrorc                 C   s   || _ || _||f| _d S N)statusmessageargs)r2   r:   r;   r)   r)   r*   r0   R   s   zTesseractError.__init__N)r"   r#   r$   r0   r)   r)   r)   r*   r8   Q   s    r8   c                       r+   )TesseractNotFoundErrorc                    s   t  t d d S )NzQ is not installed or it's not in your PATH. See README file for more information.)r/   r0   tesseract_cmdr1   r3   r)   r*   r0   Y   s   zTesseractNotFoundError.__init__r5   r)   r)   r3   r*   r=   X   r7   r=   c                       r+   )TSVNotSupportedc                    r-   )Nz4TSV output not supported. Tesseract >= 3.05 requiredr.   r1   r3   r)   r*   r0   a      zTSVNotSupported.__init__r5   r)   r)   r3   r*   r?   `   r7   r?   c                       r+   )ALTONotSupportedc                    r-   )Nz6ALTO output not supported. Tesseract >= 4.1.0 requiredr.   r1   r3   r)   r*   r0   h   r@   zALTONotSupported.__init__r5   r)   r)   r3   r*   rA   g   r7   rA   c                 C   s   |    z;z| d W n ty   td Y n ty!   Y nw W |   || _d S W |   || _d S W |   || _d S |   || _w )N   )	terminatewait	TypeErrorr   	Exceptionkill
returncode)processcoder)   r)   r*   rG   n   s&   


rG   c                 c   s    zL|s|   d V  W | j  | j  | j  d S z| j |d\}}|V  W n tjy;   t| d tdw W | j  | j  | j  d S | j  | j  | j  w )NrB   )timeoutzTesseract process timeout)	communicatestdinclosestdoutstderr
subprocessTimeoutExpiredrG   RuntimeError)procseconds_error_stringr)   r)   r*   timeout_manager{   s,   
	






rY   c                    s    t   fdd_S )Nc                     s    j u r | i |_ j S r9   )_result)r<   kwargsfuncwrapperr)   r*   r^      s   
zrun_once.<locals>.wrapper)r   rZ   )r]   r)   r\   r*   run_once   s   r_   c                 C   s"   d dd | t D  S )N c                 s   s    | ]}|V  qd S r9   r)   .0liner)   r)   r*   	<genexpr>   s    
zget_errors.<locals>.<genexpr>)joindecodeDEFAULT_ENCODING
splitlinesstrip)rX   r)   r)   r*   
get_errors   s
   
rj   c                 C   s\   t | r|  dn| D ] }zt| W q ty+ } z|jtkr! W Y d}~qd}~ww dS )z5Tries to remove temp files by filename wildcard path.*N)r   r   OSErrorerrnor   )	temp_namefilenameer)   r)   r*   cleanup   s   
rq   c                 C   s   t rt| trt| } t| tjstd| jsdn| j}|tvr&tdd|  v r@t	t
| jd}|| d| d |} || _| |fS )NzUnsupported image objectr   zUnsupported image format/typeA)   rs   rs   )r   r   )numpy_installed
isinstancer   r   Z	fromarrayrE   formatSUPPORTED_FORMATSZgetbandsnewRGB_MODEsizeZpasteZ
getchannel)image	extension
backgroundr)   r)   r*   prepare   s   
r~   c                 c   s    zdt dddF}t| tr)|jttt| fV  	 W d    W t|j d S t| \} }|j dt	 | }| j
|| jd |j|fV  W d    n1 sQw   Y  W t|j d S W t|j d S t|j w )NZtess_F)prefixdelete_input)rv   )r   ru   strnamer   r   r   rq   r~   r	   saverv   )r{   fr|   Zinput_file_namer)   r)   r*   r      s    
			r   Tc                 C   sh   t jt jd td}tt dr$t  |d< |d  jt jO  _t j|d _| r-t j|d< |S t j	|d< |S )N)rN   rQ   startupinfoenvSTARTUPINFOr   rP   )
rR   PIPEr   hasattrr   dwFlagsSTARTF_USESHOWWINDOWSW_HIDEwShowWindowDEVNULL)Zinclude_stdoutr[   r)   r)   r*   subprocess_args   s   


r    c              
   C   s   g }t jds|dkr|ddt|f7 }|t| |f7 }|d ur&|d|f7 }|r/|t|7 }|r:|dvr:|| ztj	|fi t
 }W n ty[ }	 z	|	jtkrT t d }	~	ww t||}
|jrmt|jt|
W d    d S 1 sxw   Y  d S )Nwin32r   nicez-n-l>   tsvosdxmlbox)sysplatform
startswithr   r>   shlexsplitappendrR   Popenr   rl   rm   r   r=   rY   rH   r8   rj   )input_filenameoutput_filename_baser|   langconfigr   rK   cmd_argsrU   rp   rX   r)   r)   r*   run_tesseract   s.   	

"r   Fc              	   C   s   t | ]\}}|||||||d}	tdi |	 |	d  t | }
t|
d*}|r<| W  d    W  d    S | tW  d    W  d    S 1 sTw   Y  W d    d S 1 sdw   Y  d S )N)r   r   r|   r   r   r   rK   r   rbr)   )r   r   r	   openreadrf   rg   )r{   r|   r   r   r   rK   Zreturn_bytesrn   r   r[   ro   Zoutput_filer)   r)   r*   run_and_get_output  s*   

"r   c              
      s   i } fdd|   dD }t|dk r|S |d}t|}t|d |k r0|d d |dk r8||7 }t|D ]<\}}t ||< |D ]0}	t|	|krPqG||krlz
tt|	| }
W n t	yk   |	| }
Y nw |	| }
|| |
 qGq<|S )Nc                    s   g | ]}|  qS r)   r   )rb   rowcell_delimiterr)   r*   
<listcomp>*  s    z file_to_dict.<locals>.<listcomp>
   r   rL   r   )
ri   r   lenpopr   	enumeratelistintfloat
ValueError)r   r   Zstr_col_idxresultrowsheaderlengthiheadr   valr)   r   r*   file_to_dict(  s2   

r   c                 C   s@   |t u r|  S |tu rzt|  W dS  ty   Y dS w dS )NTF)r   isdigitr   r   )r   _typer)   r)   r*   is_validK  s   r   c                 C   s   dd dd |  dD D S )Nc                 S   sX   i | ](}t |d krt|d t|d  d rt|d  d t|d  d |d qS )r   rB   r   )r   r   OSD_KEYS)rb   kvr)   r)   r*   
<dictcomp>Z  s
    &$zosd_to_dict.<locals>.<dictcomp>c                 s   s    | ]}| d V  qdS ): Nr   ra   r)   r)   r*   rd   \  s    zosd_to_dict.<locals>.<genexpr>r   r   )r   r)   r)   r*   osd_to_dictY  s   r   c                 C   s   t dg}| r|t| 7 }ztj|tjtjd}W n
 ty#   t w |j	dvr,t g }|j
rK|j
ttD ]}| }t|rJ|| q:|S )Nz--list-langs)rP   rQ   )r   rB   )r>   r   r   rR   runr   STDOUTrl   r=   rH   rP   rf   rg   r
   ri   LANG_PATTERNmatchr   )r   r   r   	languagesrc   r   r)   r)   r*   get_languagesa  s,   



r   c               	   C   s   zt jtdgt jtt jd} W n
 ty   t w | t	}|
tjdd d^}}|d^}}zt|}|tks@J W |S  ttfyS   td| dw )	z9
    Returns Version object of the Tesseract version
    z	--version)rQ   r   rN   
   Nr`   -zInvalid tesseract version: "")rR   check_outputr>   r   r   r   rl   r=   rf   rg   lstripr!   	printable	partitionr   TESSERACT_MIN_VERSIONAssertionErrorr   
SystemExit)outputZraw_versionZstr_versionrW   versionr)   r)   r*   get_tesseract_version~  s(   

r   c                    sD   | d||||g t j fddt j fddt j fddi|  S )zS
    Returns the result of a Tesseract OCR run on the provided image to string
    txtc                         t  dg  S NTr   r)   r<   r)   r*   <lambda>      z!image_to_string.<locals>.<lambda>c                      s   dt   iS )Ntextr   r)   r   r)   r*   r         c                         t   S r9   r   r)   r   r)   r*   r         )r   r%   r'   r(   r{   r   r   r   output_typerK   r)   r   r*   image_to_string  s   r   pdfc                 C   s0   |dvrt d| | |||||dg}t| S )zU
    Returns the result of a Tesseract OCR run on the provided image to pdf/hocr
    >   r   ZhocrzUnsupported extension: T)r   r   )r{   r   r   r   r|   rK   r<   r)   r)   r*   image_to_pdf_or_hocr  s   r   c                 C   s8   t  tk rt d|  }| d||||dg}t| S )zU
    Returns the result of a Tesseract OCR run on the provided image to ALTO XML
    z-c tessedit_create_alto=1 r   T)r   TESSERACT_ALTO_VERSIONrA   ri   r   )r{   r   r   r   rK   r<   r)   r)   r*   image_to_alto_xml  s
   
r   c                    sR   |   d}| d||||g tj fddtj fddtj fddi|  S )zR
    Returns string containing recognized characters and their box boundaries
    z batch.nochop makeboxr   c                      r   r   r   r)   r   r)   r*   r     r   z image_to_boxes.<locals>.<lambda>c                      s   t dt   ddS )Nz char left bottom right top page
r`   r   r   r   r)   r   r)   r*   r     s
    c                      r   r9   r   r)   r   r)   r*   r     r   ri   r   r%   r'   r(   r   r)   r   r*   image_to_boxes  s   r   c              	   C   sT   t st tdd}z|| W n ttfy   Y nw tjtt	|  fi |S )N	)quotingsep)
pandas_installedr,   r   updaterE   r   pdZread_csvr   r   )r<   r   r[   r)   r)   r*   get_pandas_output  s   
r   c              
      sr   t  tk rt d|  }| d||||g tj fddtj fddtj fddtj fddi|  S )zt
    Returns string containing box boundaries, confidences,
    and other information. Requires Tesseract 3.05+
    z-c tessedit_create_tsv=1 r   c                      r   r   r   r)   r   r)   r*   r     r   zimage_to_data.<locals>.<lambda>c                      s   t  dg S r   )r   r)   r<   pandas_configr)   r*   r     s    c                      s   t t  ddS )Nr   rL   r   r)   r   r)   r*   r     s    c                      r   r9   r   r)   r   r)   r*   r     r   )	r   r   r?   ri   r   r%   r&   r'   r(   )r{   r   r   r   r   rK   r   r)   r   r*   image_to_data  s   
r   r   c                    sR   d|   }| d||||g tj fddtj fddtj fddi|  S )zN
    Returns string containing the orientation and script detection (OSD)
    z--psm 0 r   c                      r   r   r   r)   r   r)   r*   r   )  r   zimage_to_osd.<locals>.<lambda>c                      s   t t  S r9   )r   r   r)   r   r)   r*   r   *  r   c                      r   r9   r   r)   r   r)   r*   r   +  r   r   r   r)   r   r*   image_to_osd  s   r   c               
   C   s,  t tjdkrtjd d } }n#t tjdkr*tjd dkr*tjd tjd } }n	tdtjd dS z"t| }tt||d W d    W d S 1 sNw   Y  W d S  tyt } ztt	| d	tjd W Y d }~dS d }~w t
y } ztt|j d
| tjd W Y d }~dS d }~ww )Nr   rB      r      z(Usage: pytesseract [-l lang] input_file
)file)r   r   r   )r   r   argvprintrQ   r   r   r   r=   r   rl   typer"   )ro   r   Zimgrp   r)   r)   r*   main/  s&   &r  __main__r9   )T)r   r   r   )r   Nr   r   r   F)r   )Nr   r   r   r   )Nr   r   r   )Zrer   r!   rR   r   
contextlibr   csvr   rm   r   	functoolsr   globr   ior   osr   r	   r
   r   Zos.pathr   r   r   pkgutilr   tempfiler   timer   Zpackaging.versionr   r   r   ZPILr   r>   rt   r   r   r   r   r   rg   compiler   ry   rw   r   r   r   r   r   r   r   EnvironmentErrorr,   rT   r8   r=   r?   rA   rG   rY   r_   rj   rq   r~   r   r   r   r   r   r   r   r   r   r(   r   r   r   r   r   r   r   r  r"   exitr)   r)   r)   r*   <module>   s   
	



(
#






!

