o
    Me                     @   s  d Z ddlmZ ddlZddlZddlZddlZddlm	Z
 ddlm  mZ g ZdZdZdZdZdZd	Zd
ZdZdZdZdd Zdd Zeddddddd Zeddddddd Zeddddddd Zedddddd"ddZedddddd d! ZdS )#a  
WMT14 dataset.
The original WMT14 dataset is too large and a small set of data for set is
provided. This module will download dataset from
http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
parse training set and test set into paddle reader creators.

    )print_functionNzJhttp://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgzZ 7d7897317ddd8ba0ae5c5fa7248d3ff5z/http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgzZ 0791583d57d5beb693b9414c5b36798cz1http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgzZ 0cb4a5366189b6acba876491c8724fa3z<s>z<e>z<unk>   c                 C   s   dd }t j| dd>}dd |D }t|dksJ |||d |}d	d |D }t|dks4J |||d |}||fW  d    S 1 sLw   Y  d S )
Nc                 S   s<   t  }t| D ]\}}||k r||t| < q |S |S )N)dict	enumeratecptto_textstrip)fdsizeZout_dict
line_countline r   DD:\Projects\ConvertPro\env\Lib\site-packages\paddle/dataset/wmt14.py	__to_dict6   s   z!__read_to_dict.<locals>.__to_dictrmodec                 S      g | ]}|j d r|j qS )zsrc.dictnameendswith.0Z	each_itemr   r   r   
<listcomp>@       
z"__read_to_dict.<locals>.<listcomp>   r   c                 S   r   )ztrg.dictr   r   r   r   r   r   F   r   )tarfileopenlenextractfile)tar_file	dict_sizer   fnamessrc_dicttrg_dictr   r   r   __read_to_dict4   s   	$r&   c                    s    fdd}|S )Nc                  3   s(   t \ tjdd{} fdd| D }|D ]f}| |D ]^}t|}| d}t|dkr7q"|d }| } fddt	g| t
g D }|d	 }| }	fd
d|	D }
t|dkskt|
dkrlq"|
t
 g }t	 g|
 }
||
|fV  q"qW d    d S 1 sw   Y  d S )Nr   r   c                    s   g | ]}|j  r|j qS r   r   r   )	file_namer   r   r   T   r   z2reader_creator.<locals>.reader.<locals>.<listcomp>	r   r   c                       g | ]}  |tqS r   getUNK_IDXr   w)r$   r   r   r   `   s    
r   c                    r)   r   r*   r-   )r%   r   r   r   g   s    P   )r&   r   r   r   r   r   r   splitr   STARTEND)r"   r#   r   r   Z
line_splitZsrc_seqZ	src_wordsZsrc_idsZtrg_seqZ	trg_wordsZtrg_idsZtrg_ids_nextr!   r'   r    )r$   r%   r   readerQ   s8   


"zreader_creator.<locals>.readerr   )r    r'   r!   r4   r   r3   r   reader_creatorO   s    r5   z2.0.0zpaddle.text.datasets.WMT14r   z>Please use new dataset API which supports paddle.io.DataLoader)ZsinceZ	update_tolevelreasonc                 C      t tjjtdtd| S )a  
    WMT14 training set creator.

    It returns a reader creator, each sample in the reader is source language
    word ID sequence, target language word ID sequence and next word ID
    sequence.

    :return: Training reader creator
    :rtype: callable
    wmt14ztrain/trainr5   paddledatasetcommondownload	URL_TRAIN	MD5_TRAINr!   r   r   r   traint      rB   c                 C   r8   )z
    WMT14 test set creator.

    It returns a reader creator, each sample in the reader is source language
    word ID sequence, target language word ID sequence and next word ID
    sequence.

    :return: Test reader creator
    :rtype: callable
    r9   z	test/testr:   rA   r   r   r   test   rC   rD   c                 C   r8   )Nr9   zgen/genr:   rA   r   r   r   gen   s   rE   Tc                 C   sT   t jjtdt}t|| \}}|r&dd t|D }dd t|D }||fS )Nr9   c                 S      i | ]\}}||qS r   r   r   kvr   r   r   
<dictcomp>       zget_dict.<locals>.<dictcomp>c                 S   rF   r   r   rG   r   r   r   rJ      rK   )	r;   r<   r=   r>   r?   r@   r&   six	iteritems)r!   reverser    r$   r%   r   r   r   get_dict   s   rO   c                   C   s(   t jjtdt t jjtdt d S )Nr9   )r;   r<   r=   r>   r?   r@   	URL_MODEL	MD5_MODELr   r   r   r   fetch   s   rR   )T) __doc__
__future__r   rL   r   gzipZpaddle.dataset.commonr;   Zpaddle.compatcompatr   Zpaddle.utils.deprecatedutils
deprecated__all__ZURL_DEV_TESTZMD5_DEV_TESTr?   r@   rP   rQ   r1   r2   ZUNKr,   r&   r5   rB   rD   rE   rO   rR   r   r   r   r   <module>   sp   	%


