o
    e                     @  s   d Z ddlmZ ddlZddlZddlmZ ddlmZ ddl	m
Z
mZ ddlmZ ddlmZ dd	lmZ eeZed
d Zedd Zedd ZG dd deZedd ZdS )z1Built-in plugin to implement OCR using Tesseract.    )annotationsN)hookimpl)	tesseract)numeric
str_to_int)clamp)	OcrEngine)check_external_programc                 C  s   |  dd}|jdddg dd |jdd	td
tdddd |jdd	tdtdddd |jdd	ttjdddd |jddttdddd |jdddd  |jd!dd"d  d S )#NZ	Tesseractz!Advanced control of Tesseract OCRz--tesseract-configappendZCFGz=Additional Tesseract configuration files -- see documentation)actionmetavardefaulthelpz--tesseract-pagesegmodestoreZPSMr      z;Set Tesseract page segmentation mode (see tesseract --help))r   typer   choicesr   z--tesseract-oemZMODE   z|Set Tesseract 4+ OCR engine mode: 0 - original Tesseract only; 1 - neural nets LSTM only; 2 - Tesseract + LSTM; 3 - default.z--tesseract-thresholdingautoZMETHODaA  Set Tesseract 5.0+ input image thresholding mode. This may improve OCR results on low quality images or those that contain high constrast color. legacy-otsu is the Tesseract default; adaptive-otsu is an improved Otsu algorithm with improved sort for background color changes; sauvola is based on local standard deviation.)r   r   r   r   r   z--tesseract-timeoutg     f@ZSECONDSzVGive up on OCR after the timeout, but copy the preprocessed page into the final output)r   r   r   r   z--user-wordsFILEa  Specify the location of the Tesseract user words file. This is a list of words Tesseract should consider while performing OCR in addition to its standard language dictionaries. This can improve OCR quality especially for specialized and technical documents.)r   r   z--user-patternsz9Specify the location of the Tesseract user patterns file.)	add_argument_groupadd_argumentintranger   r   ZTESSERACT_THRESHOLDING_METHODSr   float)parserZtess r   VD:\Projects\ConvertPro\env\Lib\site-packages\ocrmypdf/builtin_plugins/tesseract_ocr.pyadd_options   sb   
r   c                 C  sb   t ddditjdtjd | jdkrd| _t s#| jdkr#td	 | j	d
v r/td d S d S )Nr   linuxztesseract-ocrz4.1.1)programpackageZversion_checkerZneed_versionZversion_parserr   sandwichr   zThe installed version of Tesseract does not support changes to its thresholding method. The --tesseract-threshold argument will be ignored.)r      zdThe --tesseract-pagesegmode argument you select will disable OCR. This may cause processing to fail.)
r	   r   versionZTesseractVersionpdf_rendererZhas_thresholdingtesseract_thresholdinglogwarningtesseract_pagesegmodeoptionsr   r   r   check_optionsZ   s$   
	
r,   c                 C  sV   t jdd st|jt|  dd}t|t jd< ntt jd }t	
d| d S )NZOMP_THREAD_LIMIT       z&Using Tesseract OpenMP thread limit %d)osenvironget	isnumericr   jobslenstrr   r'   debug)Zpdfinfor+   Ztess_threadsr   r   r   validateu   s
   
r8   c                   @  sn   e Zd ZdZedd Zedd Zdd Zedd	 Zed
d Z	edddZ
edd Zedd ZdS )TesseractOcrEnginezImplements OCR with Tesseract.c                   C     t  S N)r   r$   r   r   r   r   r$         zTesseractOcrEngine.versionc                 C  s&   | j dkrdnd}d| dt  S )Nr"   z-PDFr-   zTesseract OCR )r%   r9   r$   )r+   tagr   r   r   creator_tag   s   zTesseractOcrEngine.creator_tagc                 C  s   dt   S )NzTesseract OCR )r9   r$   )selfr   r   r   __str__   s   zTesseractOcrEngine.__str__c                 C  r:   r;   )r   Zget_languagesr*   r   r   r   	languages   r<   zTesseractOcrEngine.languagesc                 C  s   t j| |j|jdS )N)engine_modetimeout)r   get_orientationtesseract_oemtesseract_timeout
input_filer+   r   r   r   rE      s
   z"TesseractOcrEngine.get_orientationreturnr   c                 C  s   t j| |j|j|jdS )N)rB   rC   rD   )r   
get_deskewrB   rF   rG   rH   r   r   r   rK      s   zTesseractOcrEngine.get_deskewc                 C  4   t j| |||j|j|j|j|j|j|j|j	d d S )N)rI   output_hocroutput_textrB   rC   
tessconfigrD   pagesegmodethresholding
user_wordsuser_patterns)
r   generate_hocrrB   rF   tesseract_configrG   r)   r&   rR   rS   )rI   rM   rN   r+   r   r   r   rT         
z TesseractOcrEngine.generate_hocrc                 C  rL   )N)rI   
output_pdfrN   rB   rC   rO   rD   rP   rQ   rR   rS   )
r   generate_pdfrB   rF   rU   rG   r)   r&   rR   rS   )rI   rW   rN   r+   r   r   r   rX      rV   zTesseractOcrEngine.generate_pdfN)rJ   r   )__name__
__module____qualname____doc__staticmethodr$   r?   rA   rB   rE   rK   rT   rX   r   r   r   r   r9      s"    




r9   c                   C  s   t  S r;   )r9   r   r   r   r   get_ocr_engine   s   r^   )r\   
__future__r   loggingr0   Zocrmypdfr   Zocrmypdf._execr   Zocrmypdf.clir   r   Zocrmypdf.helpersr   Zocrmypdf.pluginspecr   Zocrmypdf.subprocessr	   	getLoggerrY   r'   r   r,   r8   r9   r^   r   r   r   r   <module>   s(   

D

E