o
    Me                     @   s   d Z ddlmZ ddlZddlm  mZ ddlZddl	Z	ddl
Z
g ZdZdZG dd deZddd	ZdddZdd ZedddddejfddZedddddejfddZeddddddd ZdS )z
imikolov's simple dataset.

This module will download dataset from
http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
into paddle reader creators.
    )print_functionNz<https://dataset.bj.bcebos.com/imikolov%2Fsimple-examples.tgzZ 30177ea32e27c525793142b6bf2c8e2dc                   @   s   e Zd ZdZdZdS )DataType      N)__name__
__module____qualname__NGRAMSEQ r   r   GD:\Projects\ConvertPro\env\Lib\site-packages\paddle/dataset/imikolov.pyr   %   s    r   c                 C   sb   |d u r	t t}| D ]#}|  D ]
}||  d7  < q|d  d7  < |d  d7  < q|S )Nr   <s><e>)collectionsdefaultdictintstripsplit)f	word_freqlwr   r   r   
word_count*   s   
r   2   c              	      s   d}d}t tjjtjjjdtjjjR}|	|}|	|}t
|t
|}d|v r/|d=  fddt|D }t|dd d	}tt| \}}	ttt|tjt|}
t||
d< W d
   |
S 1 skw   Y  |
S )z
    Build a word dictionary from the corpus,  Keys of the dictionary are words,
    and values are zero-based IDs of these words.
    $./simple-examples/data/ptb.train.txt$./simple-examples/data/ptb.valid.txtimikolov<unk>c                    s   g | ]
}|d   kr|qS )r   r   ).0xmin_word_freqr   r   
<listcomp>I   s    zbuild_dict.<locals>.<listcomp>c                 S   s   | d  | d fS )Nr   r   r   )r   r   r   r   <lambda>M   s    zbuild_dict.<locals>.<lambda>)keyN)tarfileopenpaddledatasetcommondownloadr   URLMD5extractfiler   six	iteritemssortedlistzipdictmovesrangelen)r!   Ztrain_filenameZtest_filenametfZtrainfZtestfr   Zword_freq_sortedwords_word_idxr   r    r   
build_dict7   s2   



r;   c                    s    fdd}|S )Nc                  3   s\   t tjjtjjjdtjjj} | 	}d  |D ]|}t
jkrddks.J ddg|   dg }t|krc fdd|D }tjt|d	 D ]}t|| | V  qUqt
jkr|  } fd
d|D }d g| }|d g }dkrt|krq||fV  qJ dW d    d S 1 sw   Y  d S )Nr   r   zInvalid gram lengthr   r   c                       g | ]} | qS r   getr   r   UNKr:   r   r   r"   d       z2reader_creator.<locals>.reader.<locals>.<listcomp>r   c                    r=   r   r>   r@   rA   r   r   r"   i   rC   r   FzUnknow data type)r%   r&   r'   r(   r)   r*   r   r+   r,   r-   r   r	   r   r   r6   r.   r4   r5   tupler
   )r7   r   r   iZsrc_seqZtrg_seq	data_typefilenamenr:   )rB   r   readerW   s:   



"zreader_creator.<locals>.readerr   )rH   r:   rI   rG   rJ   r   rF   r   reader_creatorU   s   rK   z2.0.0zpaddle.text.datasets.Imikolovr   z>Please use new dataset API which supports paddle.io.DataLoader)ZsinceZ	update_tolevelreasonc                 C      t d| ||S )a  
    imikolov training set creator.

    It returns a reader creator, each sample in the reader is a word ID
    tuple.

    :param word_idx: word dictionary
    :type word_idx: dict
    :param n: sliding window size if type is ngram, otherwise max length of sequence
    :type n: int
    :param data_type: data type (ngram or sequence)
    :type data_type: member variable of DataType (NGRAM or SEQ)
    :return: Training reader creator
    :rtype: callable
    r   rK   r:   rI   rG   r   r   r   traint      rQ   c                 C   rN   )a  
    imikolov test set creator.

    It returns a reader creator, each sample in the reader is a word ID
    tuple.

    :param word_idx: word dictionary
    :type word_idx: dict
    :param n: sliding window size if type is ngram, otherwise max length of sequence
    :type n: int
    :param data_type: data type (ngram or sequence)
    :type data_type: member variable of DataType (NGRAM or SEQ)
    :return: Test reader creator
    :rtype: callable
    r   rO   rP   r   r   r   test   rR   rS   c                   C   s   t jjtdt d S )Nr   )r'   r(   r)   r*   r+   r,   r   r   r   r   fetch   s   rT   )N)r   )__doc__
__future__r   Zpaddle.dataset.commonr'   Zpaddle.utils.deprecatedutils
deprecatedr   r%   r.   __all__r+   r,   objectr   r   r;   rK   r	   rQ   rS   rT   r   r   r   r   <module>   sF   

