o
    e ^                     @  s6  d Z ddlmZ ddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZmZ ddlmZ ddlZdd	lmZmZmZmZmZmZmZmZm Z  dd
l!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ e0e1Z2dZ3dZ4ede5Z6G dd deZ7ddddZ8deddZ9dedd Z:dfd&d'Z;dgd)d*Z<dgd+d,Z=dhd0d1Z>did3d4Z?djd6d7Z@dkd<d=ZAdld>d?ZBdmdCdDZCdndGdHZDdgdIdJZEdodMdNZFdpdOdPZGdqdSdTZHdrdXdYZIe% ZJeJfdsd\d]ZKdtd_d`ZLe1dakreLejMd^ ejMdb ejMdc  dS dS )uz/Post-processing image optimization of OCR PDFs.    )annotationsN)defaultdict)fspath)Path)CallableIterator
MutableSet
NamedTupleNewTypeSequence)compress)	
DictionaryNameObjectObjectStreamModePdfPdfErrorPdfImageStreamUnsupportedImageTypeError)Image)ExecutorSerialExecutor)jbig2encpngquant)
PdfContext)OutputFileAccessError)IMG2PDF_KWARGSsafe_symlinkK   F   Xrefc                   @  s"   e Zd ZU dZded< ded< dS )XrefExtz$A PDF xref and image extension pair.r!   xrefstrextN)__name__
__module____qualname____doc____annotations__ r+   r+   AD:\Projects\ConvertPro\env\Lib\site-packages\ocrmypdf/optimize.pyr"   0   s   
 r"   rootr   r#   r%   r$   returnc                 C  s   | |d|  S )N08dr+   )r-   r#   r%   r+   r+   r,   img_name7   s   r0   c                 C     t | |dS )N.pngr0   r-   r#   r+   r+   r,   png_name;      r5   c                 C  r1   )N.jpgr3   r4   r+   r+   r,   jpg_name?   r6   r8   piker   imager   +tuple[PdfImage, tuple[Name, Object]] | Nonec                 C  s  ~ ~|j tjkr
d S |jdk rtd| d d S |jdk s$|jdk r/td| d d S t|}t	|j
dkr|j
d }|j
d }t	|j
dkrw|d tjkrw|d tjddkrw|d tjkrw|d swtd| d	 |j
d }ntd| d
 d S |j
d }|jdkrtd| d d S |d tjkrtd| d d S |d tjkr|d dddkrtd| d d S tj|v rtd| d d S ||fS )Nd   xref z': skipping image with small stream size   z0: skipping image with unusually small dimensions   r      zQ: found image compressed as /FlateDecode /DCTDecode, marked for JPEG optimizationz2: skipping image with multiple compression filtersz: skipping wide gamut imagez: skipping JPEG2000 imagez/Kz: skipping CCITT Group 3 imagez": skipping image with Decode table)ZSubtyper   r   LengthlogdebugZWidthZHeightr   lenZfilter_decodeparmsFlateDecodegetZ	Predictor	DCTDecodebits_per_componentZ	JPXDecodeZCCITTFaxDecodeZDecode)r9   r-   r:   r#   pimZfirst_filtdpZsecond_filtdpfiltdpr+   r+   r,   extract_image_filterC   sP   





"
rK   XrefExt | Nonec                 C  s:  ~t | |||}|d u rd S |\}}|jdkr|d tjkrt r|jtjd }|d us1|j	rzWz0tj
|j_||d }	|	d}
|j|
d}W d    n1 sTw   Y  |	|	| W n tyz   Y W |d uru||j_d S |j`d S w W |d ur||j_n|j`n|d ur||j_w |j`w t||S d S )Nr?   r   r/   wbstream)rK   rH   r   JBIG2Decoder   	availableobjrF   Z
ColorSpaceZ
image_maskZ
DeviceGrayopen
extract_torenamewith_suffixr   r"   )r9   r-   r:   r#   optionsresultrI   rJ   
colorspaceimgnamefr%   r+   r+   r,   extract_image_jbig2|   sB   





r\   c                 C  s  t | |||}|d u rd S |\}}|jdkrd S |d tjkr_|jdkr_z+||d }|d}	|j|	d}
W d    n1 sAw   Y  |||
 W n
 t	yY   Y d S w t
||
S |jr||j|jv r||jdkr|| t|| t
|dS |js|j|jv rz| t|| W n ty   td	 Y d S w t
|dS |js|jtjkr|jdkr|js| t|| t
|dS d S )
Nr?   r   r@   r/   rM   rN      r2   z8PDF contains an atypical image that cannot be optimized.)rK   rH   r   rG   optimizerS   rT   rU   rV   r   r"   ZindexedrY   ZSIMPLE_COLORSPACESZas_pil_imagesaver5   NotImplementedErrorrB   warningZICCBasedjbig2_lossy)r9   r-   r:   r#   rW   rX   rI   rJ   rZ   r[   r%   r+   r+   r,   extract_image_generic   sP   
	






rc   
extract_fnCallable[..., XrefExt | None]Iterator[tuple[int, XrefExt]]c              	   c  sh   t  }t  }i }d}t| jD ]_\}}	z|	jj}
W n	 ty#   Y qw t|
 D ]D\}}|jd dkr6q*t	|jd }t
j|v rXt	|jjd }|| td| d || td| d ||vrn|||< q*q|| }|D ];}| |df}z|| ||||d}W n ty   td| d |d7 }Y qvw |r|\}}|| t||fV  qvdS )	a  Extract image using extract_fn

    Enumerate images on each page, lookup their xref/ID number in the PDF.
    Exclude images that are soft masks (i.e. alpha transparency related).
    Record the page number on which an image is first used, since images may be
    used on multiple pages (or multiple times on the same page).

    Current we do not check Form XObjects or other objects that may contain
    images, and we don't evaluate alternate images or thumbnails.

    extract_fn must decide if wants to extract the image in this context. If
    it does a tuple should be returned: (xref, ext) where .ext is the file
    extension. extract_fn must also extract the file it finds interesting.
    r   r?   r=   z': skipping image because it is an SMaskz': treating as an optimization candidate)r9   r-   r:   r#   rW   z0: While extracting this image, an error occurredN)set	enumeratepagesZ	ResourcesZXObjectAttributeErrordictitemsZobjgenr!   r   ZSMaskaddrB   rC   
get_object	Exception	exceptionr"   )r9   r-   rW   rd   Zinclude_xrefsZexclude_xrefsZpageno_for_xreferrorspagenopageZxobjsZ_imnamer:   r#   Z
smask_xrefZworking_xrefsrX   _r%   r+   r+   r,   extract_images   sV   





ru   tuple[list[Xref], list[Xref]]c                 C  s   g }g }t | ||tD ]!\}}td| |jdkr!||j q|jdkr,||j qtdt| dt|  ||fS )z0Extract any >=2bpp image we think we can improvez%sr2   r7   zOptimizable images: JPEGs: z PNGs: )ru   rc   rB   rC   r%   appendr#   rD   )r9   r-   rW   jpegspngsrt   xref_extr+   r+   r,   extract_images_generic!  s   

r{   dict[int, list[XrefExt]]c                 C  sP   t t}t| ||tD ]\}}||j }|| | qtdt|  |S )z?Extract any bitonal image that we think we can improve as JBIG2z"Optimizable images: JBIG2 groups: )	r   listru   r\   jbig2_page_group_sizerw   rB   rC   rD   )r9   r-   rW   jbig2_groupsrr   rz   groupr+   r+   r,   extract_images_jbig22  s   
r   r   executorr   Nonec              	   C  sf   ddd}ddd}|j d	kr|}tj}n|}tj}|d
|jtt| dd|j d|||| d dS )z&Produce JBIG2 images from their groupsr-   r   groupsr|   c                 3  sB    |  D ]\}}d|d}t  fdd|D |fV  qd S )Nr   r/   c                 3  s     | ]\}}t  ||V  qd S Nr3   ).0r#   r%   r-   r+   r,   	<genexpr>H  s    zB_produce_jbig2_images.<locals>.jbig2_group_args.<locals>.<genexpr>)rl   r   )r-   r   r   	xref_extsprefixr+   r   r,   jbig2_group_argsC  s   z/_produce_jbig2_images.<locals>.jbig2_group_argsc           	   	   s  sf    |  D ]+\}}d|d}t|D ]\}}|\}}t| t| ||| | d|d fV  qqd S )Nr   r/   .04d)rl   rh   r   r0   )	r-   r   r   r   r   nrz   r#   r%   r+   r+   r,   jbig2_single_argsL  s   
z0_produce_jbig2_images.<locals>.jbig2_single_argsr?   TZJBIG2item)totaldescunitdisableuse_threadsmax_workerstqdm_kwargstasktask_argumentsN)r-   r   r   r|   )r   r|   )r~   r   Zconvert_group_mpZconvert_single_mpjobsrk   rD   progress_bar)r   r-   rW   r   r   r   Z
jbig2_argsZjbig2_convertr+   r+   r,   _produce_jbig2_images>  s&   

	

r   c                 C  s   t |||| | D ]Y\}}d|d}||d  }| r.| }	t| |	}
t|
d}n|jdkr6d}nt|t|D ]%\}}|\}}||d|d  }| }| 	|d	}|j
|tj|d
 q>qdS )a  Convert images to JBIG2 and insert into PDF.

    When the JBIG2 page group size is > 1 we do several JBIG2 images at once
    and build a symbol dictionary that will span several pages. Each JBIG2
    image must reference to its symbol dictionary. If too many pages shared the
    same dictionary JBIG2 encoding becomes more expensive and less efficient.
    The default value of 10 was determined through testing. Currently this
    must be lossy encoding since jbig2enc does not support refinement coding.

    When the JBIG2 symbolic coder is not used, each JBIG2 stands on its own
    and needs no dictionary. Currently this must be lossless JBIG2.
    r   r/   z.sym)ZJBIG2Globalsr?   Nr   r   r   filterZdecode_parms)r   rl   exists
read_bytesr   r   r~   FileNotFoundErrorrh   rn   writer   rP   )r9   r   r-   rW   r   r   r   r   Zjbig2_symfileZjbig2_globals_dataZjbig2_globalsZjbig2_globals_dictr   rz   r#   rt   Zjbig2_im_fileZjbig2_im_dataim_objr+   r+   r,   convert_to_jbig2m  s*   

r   argstuple[Xref, Path, Path, int]tuple[Xref, Path | None]c                 C  s   | \}}}}t |}|j|d|d W d    n1 sw   Y  | j| jkr>td| d |  |d fS ||fS )NT)r^   Zqualityr=   z, jpeg, made larger - skip)r   rS   r_   statst_sizerB   rC   unlink)r   r#   in_jpgopt_jpgjpeg_qualityZimr+   r+   r,   _optimize_jpeg  s   r   rx   Sequence[Xref]c              	     sP   d fdd}dfdd}|d	j td
t dj dt| |d d S )Nr.   &Iterator[tuple[Xref, Path, Path, int]]c                  3  s4     D ]} t | }|d}| ||jfV  qd S )Nz.opt.jpg)r8   rV   r   )r#   r   r   )rx   rW   r-   r+   r,   	jpeg_args  s   

z"transcode_jpegs.<locals>.jpeg_argsrX   r   c                   s<   | \}}|r|  } |d}|j|tjd |  d S Nr   )r   )r   rn   r   r   rG   update)rX   pbarr#   r   compdatar   )r9   r+   r,   finish_jpeg  s   z$transcode_jpegs.<locals>.finish_jpegTzRecompressing JPEGsr:   r   r   r   r   r   r   r   r   r   Ztask_finished)r.   r   )rX   r   )r   rk   rD   r   r   )r9   rx   r-   rW   r   r   r   r+   )rx   rW   r9   r-   r,   transcode_jpegs  s   
r   c                 C  sP   t | |||}|d u rd S |\}}|d tjkr&|d s&|jdkr&t|dS d S )Nr   r?   z.memory)rK   r   rG   r^   r"   )r9   r-   r:   r#   rW   rX   Z_pimrJ   r+   r+   r,   _find_deflatable_jpeg  s    
r   %tuple[Pdf, threading.Lock, Xref, int]tuple[Xref, bytes]c              	   C  s   | \}}}}|) | |d}z| }W n ty)   |df Y W  d    S w W d    n1 s4w   Y  t||}t|t|krJ|dfS ||fS )Nr       )rn   read_raw_bytesr   r   rD   )r   r9   lockr#   	complevelxobjdatar   r+   r+   r,   _deflate_jpeg  s   

r   c           	   	     s   g t ||tD ]\}}|j}td| d | q	|jdkr&dnd t d fdd	}fd
d}|d|j	t
dtd|j dt| |d d S )Nr=   z!: marking this JPEG as deflatabler]   	      r.   r   c                  3  s    D ]	} |  fV  qd S r   r+   r#   r   rx   r   r9   r+   r,   deflate_args  s   z#deflate_jpegs.<locals>.deflate_argsc                   sf   | \}}t |dkr-  |d}|j|tjtjgd W d    n1 s(w   Y  |  d S r   )rD   rn   r   r   rE   rG   r   )rX   r   r#   r   r   )r   r9   r+   r,   finish  s   zdeflate_jpegs.<locals>.finishTzDeflating JPEGsr:   r   r   )r.   r   )ru   r   r#   rB   rC   rw   r^   	threadingLockr   rk   rD   r   r   )	r9   r-   rW   r   Z_pagenorz   r#   r   r   r+   r   r,   deflate_jpegs  s,   
r   filenameboolc                 C  s,  | d}|d}tjt|fd|it W d    n1 s"w   Y  t|`}tt|j	d j
 }| |}| |d}|j| |j|jd t| t|  }	h d}
|	|
8 }	| D ]}|tjkr{t||
vr{|| ||< qh|	D ]}||= q~W d    dS 1 sw   Y  dS )Nz.png.pdfrM   Zoutputstreamr   r   >	   z/IDz/OCz/Interpolatez/SMaskz/OPIz/Maskz/Intentz/StructParentz	/MetadataT)rV   rS   img2pdfconvertr   r   r   nextiterri   imagesvaluesZcopy_foreignrn   r   r   FilterZDecodeParmsrg   keysr   rA   r$   )r9   r   r#   outputr[   Z	pdf_imageZforeign_imageZlocal_imager   Zdel_keysZkeep_fieldskeyr+   r+   r,   _transcode_png
  s6   


!!r   r   image_name_fnCallable[[Path, Xref], Path]c           	   	     s   t  |jdkr8td|jd td|jd f fdd}|d|jtdtd|j d	t	j
| d
 D ]}t|}t| || q:d S )Nr@   
   r<   c                  3  sL    D ] } t  |   | t| d d fV  |  qd S )Nr   r?   )rB   rC   r5   rm   r   r   r   modifiedpng_qualityr-   r+   r,   pngquant_argsB  s   z%transcode_pngs.<locals>.pngquant_argsTZPNGsr:   r   r   )rg   r^   maxr   minr   rk   rD   r   r   Zquantize_mpr5   r   )	r9   r   r   r-   rW   r   r   r#   r   r+   r   r,   transcode_pngs3  s,   

r   
input_fileoutput_filec                 C  s  |j }|jdkrt| | |S |jdkr|jdk rtnd|_|jdkr-|jdk r*tnd|_|jdkr:|jr7dnd|_t	
| S}|jd }|jdd	 t|||\}}	t||||| t|||| t||	t||| t|||}
t||
||| |d
}|  |j|fi | W d    n1 sw   Y  |  j}| j}|dkrtdt  dd||  }|dk rtd t	
| }|  |j|fi | W d    |S 1 sw   Y  |S t|| |S )Nr   r]   (      r   r?   r   T)exist_okz.opt.pdfzeOutput file not created after optimizing. We probably ran out of disk space in the temporary folder: r   zLImage optimization did not improve the file - optimizations will not be used)rW   r^   r   r   DEFAULT_JPEG_QUALITYr   DEFAULT_PNG_QUALITYr~   rb   r   rS   parentmkdirr{   r   r   r   r5   r   r   rV   Zremove_unreferenced_resourcesr_   r   r   r   tempfile
gettempdirrB   info)r   r   contextZsave_settingsr   rW   r9   r-   rx   ry   r   Ztarget_fileZ
input_sizeZoutput_sizeZsavingsr+   r+   r,   r^   b  sZ   










r^   r?   c              
   C  s   ddl m} ddlm} G dd d}t| } || |t|dddd}| -}t||| d d }	t|d }
t| |
|	td	d	t	j
d
 |t|
t| W d    d S 1 sVw   Y  d S )Nr   )copy)TemporaryDirectoryc                   @  s   e Zd ZdZdd ZdS )zmain.<locals>.OptimizeOptionszEmulate ocrmypdf's optionsc                 S  s:   || _ || _|| _|| _|| _d| _|| _d| _d| _d S )Nr   TF)	r   r   r^   r   r   r~   rb   quietr   )selfr   r   	optimize_r   r   jb2lossyr+   r+   r,   __init__  s   
z&main.<locals>.OptimizeOptions.__init__N)r&   r'   r(   r)   r   r+   r+   r+   r,   OptimizeOptions  s    r   F)r   r   r   r   r   r   zout.pdfT)Zcompress_streamsZpreserve_pdfaZobject_stream_mode)shutilr   r   r   r   intr   r^   rk   r   generater   )infileoutfilelevelr   r   r   r   rW   Ztmpdirr   Ztmpoutr+   r+   r,   main  s6   	
"r   __main__r@   r]   )r-   r   r#   r!   r%   r$   r.   r   )r-   r   r#   r!   r.   r   )
r9   r   r-   r   r:   r   r#   r!   r.   r;   )
r9   r   r-   r   r:   r   r#   r!   r.   rL   )r9   r   r-   r   rd   re   r.   rf   )r9   r   r-   r   r.   rv   )r9   r   r-   r   r.   r|   )r   r|   r-   r   r   r   r.   r   )
r9   r   r   r|   r-   r   r   r   r.   r   )r   r   r.   r   )
r9   r   rx   r   r-   r   r   r   r.   r   )r   r   r.   r   )r9   r   r-   r   r   r   r.   r   )r9   r   r   r   r#   r!   r.   r   )
r9   r   r   r   r   r   r-   r   r.   r   )r   r   r   r   r   r   r.   r   )r?   )Nr)   
__future__r   loggingsysr   r   collectionsr   osr   pathlibr   typingr   r   r   r	   r
   r   zlibr   r   Zpikepdfr   r   r   r   r   r   r   r   r   ZPILr   Zocrmypdf._concurrentr   r   Zocrmypdf._execr   r   Zocrmypdf._jobcontextr   Zocrmypdf.exceptionsr   Zocrmypdf.helpersr   r   	getLoggerr&   rB   r   r   r   r!   r"   r0   r5   r8   rK   r\   rc   ru   r{   r   r   r   r   r   r   r   r   r   r   ZDEFAULT_EXECUTORr^   r   argvr+   r+   r+   r,   <module>   sb    ,






9
)
>
>


/
-

 


'
),
>
."