o
    Me'                     @   s  d Z ddlmZ ddlZddlZddlZddlZddlm	Z
 ddlm  mZ ddlmZmZ g ZdZdZdZdZd	Zd
ZdZdZdZdZdZdd Zdd Zdd Z			d$ddZ eddddddd Z!eddddddd Z"edddddd d! Z#edddddd"d# Z$dS )%a^  
Conll05 dataset.
Paddle semantic role labeling Book and demo use this dataset as an example.
Because Conll05 is not free in public, the default downloaded URL is test set
of Conll05 (which is public). Users can change URL and MD5 to their Conll
dataset. And a pre-trained word vector model based on Wikipedia corpus is used
to initialize SRL model.
    )print_functionN)ziprangezBhttp://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gzZ 387719152ae52d60422c016e92a742fcz:http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txtZ ea7fb7d4c75cc6254716f0177a506baaz:http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txtZ 0d2977293bbb6cbefab5b0f97db1e77cz<http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txtZ d8c7f03ceb5fc2e5a0fa7503a4353751z1http://paddlemodels.bj.bcebos.com/conll05st%2FembZ bf436eb0faa1f6f9103017f8be57cdb7c                 C   s   t  }t }t| dR}t|D ]%\}}| }|dr'||dd   q|dr5||dd   qd}|D ]}||d| < |d7 }||d| < |d7 }q:||d< W d    |S 1 s`w   Y  |S )NrB-   I-r      O)dictsetopen	enumeratestrip
startswithadd)filenamedZtag_dictfilineindextag r   FD:\Projects\ConvertPro\env\Lib\site-packages\paddle/dataset/conll05.pyload_label_dict1   s*   




r   c                 C   sT   t  }t| d}t|D ]
\}}||| < qW d    |S 1 s#w   Y  |S )Nr   )r   r   r   r   )r   r   r   r   r   r   r   r   	load_dictE   s   
r   c                    s    fdd}|S )a  
    Read one corpus. It returns an iterator. Each element of
    this iterator is a tuple including sentence and labels. The sentence is
    consist of a list of word IDs. The labels include a list of label IDs.
    :return: a iterator of data.
    :rtype: iterator
    c               
   3   s   t } | }| }tj|d}tj|d}g }g }g }t||D ]\}}	t| }t|	 	 }	t
|	dkrtt
|d D ]  fdd|D }
||
 qNt
|dkrg }|d D ]}|dkrw|| qlt|dd  D ]\ }d}d}g }d	}|D ]u}|d
kr|dkr|d q|d
kr|dkr|d|  q|dkr|d|  d}q|ddkr|ddkr|d|d
 }|d|  d}q|ddkr|ddkr|d|d
 }|d|  d}qtd| ||  |fV  qg }g }g }q+|| ||	 q+W d    n	1 s*w   Y  W d    n	1 s:w   Y  |  |  |   d S )N)fileobjr   c                    s   g | ]}|  qS r   r   ).0xr   r   r   
<listcomp>e   s    z1corpus_reader.<locals>.reader.<locals>.<listcomp>r	   -r
   F *Tr   z*)()r   zUnexpected label: %s)tarfiler   extractfilegzipGzipFiler   cptZto_textr   splitlenr   appendr   findRuntimeErrorclose)tfZwfpfZ
words_fileZ
props_fileZ	sentenceslabelsZone_segwordlabelZa_kind_lableZ	verb_listr   ZlblZcur_tagZis_in_bracketZlbl_seqZ	verb_wordl	data_path
props_name
words_namer    r   readerV   sz   




 6zcorpus_reader.<locals>.readerr   )r:   r<   r;   r=   r   r9   r   corpus_readerM   s   	>r>   c                    s    fdd}|S )Nc               
   3   s     D ]\} }}t | }|d}dgt | }|dkr*d||d < | |d  }nd}|dkr=d||d < | |d  }nd}d||< | | }|t |d k r\d||d < | |d  }	nd}	|t |d k rsd||d < | |d  }
nd}
fdd| D }|tg| }|tg| }|tg| }|	tg| }|
tg| }|g| }fd	d|D }|||||||||f	V  qd S )
NzB-Vr   r	   Zbosr   Zeosc                    s   g | ]}  |tqS r   )getUNK_IDXr   w)	word_dictr   r   r!      s    z2reader_creator.<locals>.reader.<locals>.<listcomp>c                    s   g | ]}  |qS r   )r?   rA   )
label_dictr   r   r!      s    )r.   r   r?   r@   )Zsentence	predicater5   Zsen_lenZ
verb_indexmarkZctx_n1Zctx_n2Zctx_0Zctx_p1Zctx_p2Zword_idxZ
ctx_n2_idxZ
ctx_n1_idxZ	ctx_0_idxZ
ctx_p1_idxZ
ctx_p2_idxZpred_idxZ	label_idxr>   rD   predicate_dictrC   r   r   r=      sF   
zreader_creator.<locals>.readerr   )r>   rC   rH   rD   r=   r   rG   r   reader_creator   s   0rI   z2.0.0zpaddle.text.datasets.Conll05str	   z>Please use new dataset API which supports paddle.io.DataLoader)ZsinceZ	update_tolevelreasonc                  C   sL   t tjjtdt} t tjjtdt}t	tjjt
dt}| ||fS )zF
    Get the word, verb and label dictionary of Wikipedia corpus.
    	conll05st)r   paddledatasetcommondownloadWORDDICT_URLWORDDICT_MD5VERBDICT_URLVERBDICT_MD5r   TRGDICT_URLTRGDICT_MD5)rC   	verb_dictrD   r   r   r   get_dict   s   	
rX   c                   C   s   t jjtdtS )z@
    Get the trained word vector based on Wikipedia corpus.
    rL   )rM   rN   rO   rP   EMB_URLEMB_MD5r   r   r   r   get_embedding   s   	r[   c                  C   s6   t  \} }}ttjjtdtddd}t|| ||S )ap  
    Conll05 test set creator.

    Because the training dataset is not free, the test dataset is used for
    training. It returns a reader creator, each sample in the reader is nine
    features, including sentence sequence, predicate, predicate context,
    predicate context flag and tagged sequence.

    :return: Training reader creator
    :rtype: callable
    rL   z2conll05st-release/test.wsj/words/test.wsj.words.gzz2conll05st-release/test.wsj/props/test.wsj.props.gz)r<   r;   )	rX   r>   rM   rN   rO   rP   DATA_URLDATA_MD5rI   )rC   rW   rD   r=   r   r   r   test   s   r^   c                   C   s^   t jjtdt t jjtdt t jjtdt	 t jjt
dt t jjtdt d S )NrL   )rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rY   rZ   r\   r]   r   r   r   r   fetch  s
   r_   )NNN)%__doc__
__future__r   r(   r*   	itertoolsZpaddle.dataset.commonrM   Zpaddle.compatcompatr,   Zpaddle.utils.deprecatedutils
deprecatedZ	six.movesr   r   __all__r\   r]   rQ   rR   rS   rT   rU   rV   rY   rZ   r@   r   r   r>   rI   rX   r[   r^   r_   r   r   r   r   <module>   sp   	K
8


