o
    Me                     @   s   d Z ddlmZ ddlZddlm  mZ ddlZddl	Z	ddl
Z
ddlZddlZg ZdZdZdd Zdd	 Zed
dddddd Zed
dddddd Zed
dddddd Zed
dddddd Zed
dddddd ZdS )a  
IMDB dataset.

This module downloads IMDB dataset from
http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
of 25,000 highly polar movie reviews for training, and 25,000 for testing.
Besides, this module also provides API for building dictionary.
    )print_functionNz6https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gzZ 7c2ac02c03563afcf9b574c7e56c153ac                 c   s    t tjjtdtC}| }|dkrHt	| 
|jr8|| tddttj  V  | }|dksW d   dS W d   dS 1 sSw   Y  dS )zQ
    Read files that match the given pattern.  Tokenize and yield each file.
    imdbNz
)tarfileopenpaddledatasetcommondownloadURLMD5nextboolmatchnameextractfilereadrstripsixb	translatestringpunctuationlowersplit)patternZtarftf r   CD:\Projects\ConvertPro\env\Lib\site-packages\paddle/dataset/imdb.pytokenize(   s$   
"r   c           	         s   t t}t| D ]}|D ]
}||  d7  < qq	 fddt|D }t|dd d}tt| \}}t	tt|tj
t|}t||d< |S )z
    Build a word dictionary from the corpus. Keys of the dictionary are words,
    and values are zero-based IDs of these words.
       c                    s   g | ]
}|d   kr|qS )r   r   ).0xcutoffr   r   
<listcomp>G   s    zbuild_dict.<locals>.<listcomp>c                 S   s   | d  | d fS )Nr   r   r   )r!   r   r   r   <lambda>I   s    zbuild_dict.<locals>.<lambda>)key<unk>)collectionsdefaultdictintr   r   	iteritemssortedlistzipdictmovesrangelen)	r   r#   Z	word_freqdocword
dictionarywords_word_idxr   r"   r   
build_dict<   s   
r9   z2.0.0zpaddle.text.datasets.Imdbr   z>Please use new dataset API which supports paddle.io.DataLoader)ZsinceZ	update_tolevelreasonc                    sB   d g  fdd}||  d || d  fdd}|S )Nr'   c                    s0   t | D ]}| fdd|D |f qd S )Nc                    s   g | ]} | qS r   )get)r    wUNKr8   r   r   r$   [   s    z0reader_creator.<locals>.load.<locals>.<listcomp>)r   append)r   outlabelr3   r>   r   r   loadY   s    zreader_creator.<locals>.loadr   r   c                  3   s     D ]	\} }| |fV  qd S )Nr   )r3   rB   )INSr   r   reader`   s   zreader_creator.<locals>.readerr   )Zpos_patternZneg_patternr8   rC   rE   r   )rD   r?   r8   r   reader_creatorP   s   rF   c                 C      t tdtd| S )a  
    IMDB training set creator.

    It returns a reader creator, each sample in the reader is an zero-based ID
    sequence and label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Training reader creator
    :rtype: callable
    zaclImdb/train/pos/.*\.txt$zaclImdb/train/neg/.*\.txt$rF   recompiler8   r   r   r   traing      

rL   c                 C   rG   )a  
    IMDB test set creator.

    It returns a reader creator, each sample in the reader is an zero-based ID
    sequence and label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Test reader creator
    :rtype: callable
    zaclImdb/test/pos/.*\.txt$zaclImdb/test/neg/.*\.txt$rH   rK   r   r   r   test|   rM   rN   c                   C   s   t tddS )za
    Build a word dictionary from the corpus.

    :return: Word dictionary
    :rtype: dict
    z/aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$   )r9   rI   rJ   r   r   r   r   	word_dict   s   
rP   c                   C   s   t jjtdt d S )Nr   )r   r   r   r	   r
   r   r   r   r   r   fetch   s   rQ   )__doc__
__future__r   Zpaddle.dataset.commonr   Zpaddle.utils.deprecatedutils
deprecatedr(   r   rI   r   r   __all__r
   r   r   r9   rF   rL   rN   rP   rQ   r   r   r   r   <module>   sb   	



