资源简介
LSTM数据集+python源码,实测在Theano环境平台下可用!详情见我的博客:http://blog.csdn.net/zhongkelee/article/details/52090352
代码片段和文件信息
from __future__ import print_function
from six.moves import xrange
import six.moves.cPickle as pickle
import gzip
import os
import numpy
import theano
def prepare_data(seqs, labels, maxlen=None):
    """Create the padded data matrices from a list of sequences.

    Pads each sequence to the same length: the length of the longest
    sequence, or ``maxlen`` when given.

    If ``maxlen`` is set, sequences of length >= ``maxlen`` are DROPPED
    (together with their labels), not truncated.

    Note: the output is time-major — axes are swapped relative to the
    input, giving shape ``(maxlen, n_samples)``.

    :param seqs: list of sequences (each a list of int word indices)
    :param labels: list of labels, parallel to ``seqs``
    :param maxlen: optional int; filter out sequences at least this long
    :return: ``(x, x_mask, labels)`` where ``x`` is an int64 matrix of
        word indices, ``x_mask`` marks valid (non-padding) positions
        with 1., and ``labels`` is the (possibly filtered) label list.
        Returns ``(None, None, None)`` if filtering removed everything.
    """
    # x: a list of sentences
    lengths = [len(s) for s in seqs]

    if maxlen is not None:
        # Filter out sequences that are too long, keeping the three
        # parallel lists (lengths, seqs, labels) in sync.
        new_seqs = []
        new_labels = []
        new_lengths = []
        for l, s, y in zip(lengths, seqs, labels):
            if l < maxlen:
                new_seqs.append(s)
                new_labels.append(y)
                new_lengths.append(l)
        lengths = new_lengths
        labels = new_labels
        seqs = new_seqs

        # Everything was filtered out: nothing to build.
        if len(lengths) < 1:
            return None, None, None

    n_samples = len(seqs)
    maxlen = numpy.max(lengths)

    x = numpy.zeros((maxlen, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen, n_samples)).astype(theano.config.floatX)
    for idx, s in enumerate(seqs):
        # Column idx holds sequence idx; rows beyond its length stay 0
        # (padding) and are masked out by x_mask.
        x[:lengths[idx], idx] = s
        x_mask[:lengths[idx], idx] = 1.

    return x, x_mask, labels
def get_dataset_file(dataset, default_dataset, origin):
    '''Resolve the path of a dataset file, downloading it if needed.

    Look for ``dataset`` as if it were a full path; if not found, try a
    local file; if not found, try the sibling ``../data`` directory.
    Download the dataset from ``origin`` if it is not present and the
    file name matches ``default_dataset``.

    :param dataset: path or bare file name of the dataset
    :param default_dataset: canonical file name that triggers the
        ``../data`` fallback and the download
    :param origin: URL to fetch the dataset from when missing
    :return: the resolved (possibly just-downloaded) dataset path
    '''
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join(
            os.path.split(__file__)[0],
            "..",
            "data",
            dataset
        )
        # Prefer the data-directory copy when it exists, or when this is
        # the default dataset (so the download below lands there too).
        if os.path.isfile(new_path) or data_file == default_dataset:
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == default_dataset:
        from six.moves import urllib
        print('Downloading data from %s' % origin)
        urllib.request.urlretrieve(origin, dataset)
    return dataset
def load_data(path=“imdb.pkl“ n_words=100000 valid_portion=0.1 maxlen=None
sort_by_len=True):
‘‘‘Loads the dataset
:type path: String
:param path: The path to the dataset (here IMDB)
:type n_words: int
:param n_words: The number of word to keep in the vocabulary.
All extra words are set to unknow (1).
:type valid_portion: float
:param valid_portion: The proportion of the full train set used for
the validation set.
:type maxlen: None or positive int
:param maxlen: the max sequence length we use in the train/valid set.
:type sort_by_len: bool
:name sort_by_len: Sort by the sequence lenght for the train
valid and test set. This allow faster execution as it cause
less padding per minibatch. Another mechanism must be used to
shuffle the train set
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 33213513 2016-07-29 20:06 lstm\imdb.pkl
文件 5405 2016-07-29 19:57 lstm\imdb.py
文件 5242 2016-07-29 20:04 lstm\imdb.pyc
文件 22671 2016-07-29 19:50 lstm\lstm.py
文件 5649642 2016-07-29 21:48 lstm\lstm_model.npz
文件 470 2016-07-29 21:41 lstm\lstm_model.npz.pkl
目录 0 2016-07-29 20:36 lstm\
- 上一篇:CNN+pythoncode8.18.zip
- 下一篇:人脸识别Python代码
评论
共有 条评论