o
    Me6                     @   s(  d Z ddlmZ ddlZddlZddlZddlZddlmZ ddl	Z	ddl
mZ ddlm  mZ g ZdZdZdZdZd	Zd
ZdZdd Zd%ddZdd Zdd Zedddddd&ddZedddddd&ddZedddddd&dd Zedddddd%d!d"Zedddddd#d$ Z dS )'aW  
ACL2016 Multimodal Machine Translation. Please see this website for more
details: http://www.statmt.org/wmt16/multimodal-task.html#task1

If you use the dataset created for your task, please cite the following paper:
Multi30K: Multilingual English-German Image Descriptions.

@article{elliott-EtAl:2016:VL16,
 author    = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. and {Specia}, L.},
 title     = {Multi30K: Multilingual English-German Image Descriptions},
 booktitle = {Proceedings of the 6th Workshop on Vision and Language},
 year      = {2016},
 pages     = {70--74},
 year      = 2016
}
    )print_functionN)defaultdictz2http://paddlemodels.bj.bcebos.com/wmt/wmt16.tar.gzZ 0c38be43600334966403524a40dcd81ei+  iK  z<s>z<e>z<unk>c              	   C   sV  t t}tj| dd>}|dD ]0}t|}| d}t	|dkr&q|dkr.|d n|d }| D ]
}	||	  d7  < q6qW d    n1 sLw   Y  t|d	K}
|

td
tttf  ttt|dd ddD ]\}}|d |kr~ n|

t|d  |

td qrW d    d S W d    d S 1 sw   Y  d S )Nrmodewmt16/train	   enr      wbz	%s
%s
%s
c                 S   s   | d S )Nr    )xr   r   DD:\Projects\ConvertPro\env\Lib\site-packages\paddle/dataset/wmt16.py<lambda>H   s    z__build_dict.<locals>.<lambda>T)keyreverse   
)r   inttarfileopenextractfilecptto_textstripsplitlenwriteto_bytes
START_MARKEND_MARKUNK_MARK	enumeratesortedsix	iteritems)tar_file	dict_sizeZ	save_pathlang	word_dictfline
line_splitZsenwZfoutidxwordr   r   r   __build_dict8   s:   
	
"r1   Fc           	      C   s   t jtjjjd||f }t j|rtt	|d
 |kr&t| ||| i }t	|d'}t|D ]\}}|rBt| ||< q2||t| < q2W d    |S 1 sWw   Y  |S )Nwmt16/%s_%d.dictrb)ospathjoinpaddledatasetcommon	DATA_HOMEexistsr   r   	readlinesr1   r#   r   r   r   )	r'   r(   r)   r   	dict_pathr*   Zfdictr/   r,   r   r   r   __load_dictO   s$   

r>   c                 C   s4   t | |dkrtnt} t ||dkrtnt}| |fS )Nr
   )minTOTAL_EN_WORDSTOTAL_DE_WORDSsrc_dict_sizetrg_dict_sizesrc_langr   r   r   __get_dict_size`   s   rF   c                    s    fdd}|S )Nc                  3   s,   t  t dkrdnd t }  t } t dkr%dnd}d| }tjddZ}|D ]K}t|}|	 
d}t|dkrMq8|| 
 }| g fd	d
|D  |g }|| 
 }	fdd
|	D }
|
|g }| g|
 }
||
|fV  q8W d    d S 1 sw   Y  d S )Nr
   der   r   r   r   r   r	   c                       g | ]}  |qS r   get.0r.   )src_dictunk_idr   r   
<listcomp>   s    z2reader_creator.<locals>.reader.<locals>.<listcomp>c                    rH   r   rI   rK   )trg_dictrN   r   r   rO      s    )r>   r    r!   r"   r   r   r   r   r   r   r   r   )Zstart_idZend_idZsrc_colZtrg_colr+   r,   r-   Z	src_wordsZsrc_idsZ	trg_wordsZtrg_idsZtrg_ids_next	file_namerC   rE   r'   rD   )rM   rP   rN   r   readerj   s>   


"zreader_creator.<locals>.readerr   )r'   rR   rC   rD   rE   rS   r   rQ   r   reader_creatorh   s   "rT   z2.0.0zpaddle.text.datasets.WMT16r   z>Please use new dataset API which supports paddle.io.DataLoader)ZsinceZ	update_tolevelreasonr
   c                 C   B   |dvrt dt| ||\} }ttjjtdtdd| ||dS )a}  
    WMT16 train set reader.

    This function returns the reader for train data. Each sample the reader
    returns is made up of three fields: the source language word index sequence,
    target language word index sequence and next word index sequence.


    NOTE:
    The original like for training data is:
    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz

    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
    using moses's tokenization script:
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl

    Args:
        src_dict_size(int): Size of the source language dictionary. Three
                            special tokens will be added into the dictionary:
                            <s> for start mark, <e> for end mark, and <unk> for
                            unknown word.
        trg_dict_size(int): Size of the target language dictionary. Three
                            special tokens will be added into the dictionary:
                            <s> for start mark, <e> for end mark, and <unk> for
                            unknown word.
        src_lang(string): A string indicating which language is the source
                          language. Available options are: "en" for English
                          and "de" for Germany.

    Returns:
        callable: The train reader.
    r
   rG   zIAn error language type.  Only support: en (for English); de(for Germany).wmt16wmt16.tar.gzr   r'   rR   rC   rD   rE   	
ValueErrorrF   rT   r7   r8   r9   downloadDATA_URLDATA_MD5rB   r   r   r   train   s   '
ra   c                 C   rW   )a}  
    WMT16 test set reader.

    This function returns the reader for test data. Each sample the reader
    returns is made up of three fields: the source language word index sequence,
    target language word index sequence and next word index sequence.

    NOTE:
    The original like for test data is:
    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz

    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
    using moses's tokenization script:
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl

    Args:
        src_dict_size(int): Size of the source language dictionary. Three
                            special tokens will be added into the dictionary:
                            <s> for start mark, <e> for end mark, and <unk> for
                            unknown word.
        trg_dict_size(int): Size of the target language dictionary. Three
                            special tokens will be added into the dictionary:
                            <s> for start mark, <e> for end mark, and <unk> for
                            unknown word.
        src_lang(string): A string indicating which language is the source
                          language. Available options are: "en" for English
                          and "de" for Germany.

    Returns:
        callable: The test reader.
    rX   HAn error language type. Only support: en (for English); de(for Germany).rY   rZ   z
wmt16/testr[   r\   rB   r   r   r   test   s   &
rc   c                 C   rW   )a  
    WMT16 validation set reader.

    This function returns the reader for validation data. Each sample the reader
    returns is made up of three fields: the source language word index sequence,
    target language word index sequence and next word index sequence.

    NOTE:
    The original like for validation data is:
    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz

    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
    using moses's tokenization script:
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl

    Args:
        src_dict_size(int): Size of the source language dictionary. Three
                            special tokens will be added into the dictionary:
                            <s> for start mark, <e> for end mark, and <unk> for
                            unknown word.
        trg_dict_size(int): Size of the target language dictionary. Three
                            special tokens will be added into the dictionary:
                            <s> for start mark, <e> for end mark, and <unk> for
                            unknown word.
        src_lang(string): A string indicating which language is the source
                          language. Available options are: "en" for English
                          and "de" for Germany.

    Returns:
        callable: The validation reader.
    rX   rb   rY   rZ   z	wmt16/valr[   r\   rB   r   r   r   
validation   s   %
rd   c                 C   st   | dkr
t |t}nt |t}tjtjjj	d| |f }tj
|s'J d	 	 tjtjjj	d}t||| |S )a  
    return the word dictionary for the specified language.

    Args:
        lang(string): A string indicating which language is the source
                      language. Available options are: "en" for English
                      and "de" for Germany.
        dict_size(int): Size of the specified language dictionary.
        reverse(bool): If reverse is set to False, the returned python
                       dictionary will use word as key and use index as value.
                       If reverse is set to True, the returned python
                       dictionary will use index as key and word as value.

    Returns:
        dict: The word dictionary for the specific language.
    r
   r2   z Word dictionary does not exist. rZ   )r?   r@   rA   r4   r5   r6   r7   r8   r9   r:   r;   r>   )r)   r(   r   r=   r'   r   r   r   get_dict,  s   

re   c                   C   s   t jjjtdtd dS )z!download the entire dataset.
    rY   rZ   N)r7   Zv4r8   r9   r^   r_   r`   r   r   r   r   fetchO  s   rf   )F)r
   )!__doc__
__future__r   r4   r%   r   gzipcollectionsr   r7   Zpaddle.compatcompatr   Zpaddle.utils.deprecatedutils
deprecated__all__r_   r`   r@   rA   r    r!   r"   r1   r>   rF   rT   ra   rc   rd   re   rf   r   r   r   r   <module>   sr   
'00.