o
    e_"                     @   sr   d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZ ed	g d
ZG dd de	ZdS )a3  Extract fonts properties from PDF.

Font properties like font name, size are covered in :py:class:`~pdf2docx.text.TextSpan`, 
but more generic properties are required further:

* Font family name. The font name extracted and set in ``TextSpan`` might not valid when 
  directly used in MS Word, e.g. "ArialMT" should be "Arial". So, we need to get font
  family name, which should be accepted by MS Word, based on the font file itself.

* Font line height ratio. As line height = font_size * line_height_ratio, it's used to 
  calculate relative line spacing. In general, 1.12 is an approximate value to this ratio,
  but it's in fact a font-related value, especially for CJK font. 

    * So, extract font metrics, e.g. ascender and descender, with third party library ``fontTools``
      in first priority. This can obtain an accurate line height ratio, but sometimes the 
      embedded font data might crash.
  
    * Then, we have to use the default properties, i.e. ascender and descender, extracted by
      ``PyMuPDF`` directly, but this value isn't so accurate.    
    N)BytesIO)
namedtuple)TTFont   )BaseCollection)CJK_CODEPAGE_BITSCJK_UNICODE_RANGE_BITSCJK_UNICODE_RANGESFont
descriptornameline_heightc                   @   s~   e Zd ZdZdefddZedd Zedd Z	ed	efd
dZ
edefddZedefddZedefddZdS )Fontsz$Extracted fonts properties from PDF.	font_namec                 C   sb   |  |}| D ]}||jkr|  S q| D ]}||jv r |  S q| D ]}|j|v r.|  S q#dS )z.Get matched font by font name, or return None.N)_to_descriptorr   )selfr   targetZfont r   CD:\Projects\ConvertPro\env\Lib\site-packages\pdf2docx/font/Fonts.pyget&   s   
z	Fonts.getc              	   C   s   t  }|D ]}| D ]	}||d  qqg }|D ]C}||\}}}	}
| |}z|dvs3J dtt|
}| |}| |}W n t	yO   d}Y nw |
t| |||d q| |S )zExtract fonts from PDF and get properties.
        * Only embedded fonts (v.s. the base 14 fonts) can be extracted.
        * The extracted fonts may be invalid due to reason from PDF file itself.
        r   )zn/aZcffzbase font or not supported fontNr   )setZ	get_fontsaddZextract_font_normalized_font_namer   r   get_font_family_nameget_line_height_factor	Exceptionappendr
   r   )clsZfitz_docZxrefspagefZfontsZxrefbasenameext_bufferr   ttr   r   r   r   extract9   s*   


zFonts.extractc                 C   s   |  dd  dd S )zMNormalize raw font name, e.g. BCDGEE+Calibri-Bold, BCDGEE+Calibri -> Calibri.+-r   )splitr   r   r   r   r   _      zFonts._normalized_font_namer   c                 C   s   |  dd dd S )zBRemove potential space, dash in font name, and turn to upper case.  r)   )replaceupperr+   r   r   r   r   e   r,   zFonts._to_descriptortt_fontc                 C   s   d }}d}d}| d j D ]-}d|jv r|jd}n|jd}|j|kr+|s+|}n	|j|kr4|s4|}|r:|r: nqt|S )z{Get the font family name from the font's names table.

        https://gist.github.com/pklaus/dce37521579513c574d0
        r.         r       z	utf-16-bezlatin-1)namesstringdecodeZnameIDr   r   )r1   r   familyZFONT_SPECIFIER_NAME_IDZFONT_SPECIFIER_FAMILY_IDrecordZname_strr   r   r   r   k   s   

zFonts.get_font_family_namec                 C   s   | d j }| d }|j}|j}|j}|t| }|| }tjdkr?| d }|j}	|j}
|	|
 }t	d|||  }|| }|}n|}t
| }|rLd| nd| }|| S )a{  Calculate line height ratio based on ``hhea`` and ``OS/2`` tables.

        Fon non-CJK fonts::

            f = (hhea.Ascent - hhea.Descent + hhea.LineGap) / units_per_em
        
        For non-CJK fonts (Windows)::

            f = (OS/2.winAscent + OS/2.winDescent + [External Leading]) / units_per_em
            External Leading = MAX(0, hhea.LineGap - ((OS/2.WinAscent + OS/2.winDescent) - (hhea.Ascent - hhea.Descent)))

        For CJK fonts::

            f = 1.3 * (hhea.Ascent - hhea.Descent) / units_per_em

        Read more:
        * https://docs.microsoft.com/en-us/typography/opentype/spec/recom#baseline-to-baseline-distances
        * https://github.com/source-foundry/font-line#baseline-to-baseline-distance-calculations
        * https://www.zhihu.com/question/23349103
        * https://github.com/source-foundry/font-line/blob/master/lib/fontline/metrics.py
        headhheantOS/2g        g?g      ?)Z
unitsPerEmZascentZdescentZlineGapabsosr   ZusWinAscentZusWinDescentmaxr   is_cjk_font)r1   Zunits_per_emr;   Zhhea_ascentZhhea_descentZhhea_linegapZhhea_total_heightZhhea_btb_distanceos2Zos2_win_ascentZos2_win_descentZos2_win_total_heightZwin_external_leadingZwin_btb_distanceZbtb_distanceZcjkZdistancer   r   r   r      s&   


zFonts.get_line_height_factorc                 C   s:  | d }t  D ]\}}t|dr|jd|> @ r dS qt D ]L\}}|tddv r;t|dr:|jd|> @ r: dS q |tddv rTt|drS|jd|d > @ rS dS q |tdd	v rlt|d
rl|jd|d > @ rl dS q z| 	 }W n   Y dS |s~dS t
D ]}t|d |d d D ]}t||v r  dS qqdS )a  Test font object to confirm that it meets our definition of a CJK font file.

        The definition is met if any of the following conditions are True:
        1. The font has a CJK code page bit set in the OS/2 table
        2. The font has a CJK Unicode range bit set in the OS/2 table
        3. The font has any CJK Unicode code points defined in the cmap table

        https://github.com/googlefonts/fontbakery/blob/main/Lib/fontbakery/profiles/shared_conditions.py
        r=   ulCodePageRange1r3   Tr       @   ZulCodePageRange2`   ZulCodePageRange3F)r   itemshasattrrC   r   rangeZulUnicodeRange1ZulUnicodeRange2ZulUnicodeRange3ZgetBestCmapr	   int)r1   rB   r#   bitZcmapZunicode_rangexr   r   r   rA      s<   zFonts.is_cjk_fontN)__name__
__module____qualname____doc__strr   classmethodr&   staticmethodr   r   r   r   r   rA   r   r   r   r   r   #   s    
%
8r   )rP   r?   ior   collectionsr   ZfontTools.ttLibr   Zcommon.Collectionr   Zcommon.constantsr   r   r	   r
   r   r   r   r   r   <module>   s    