• Size: 8KB
    File type: .py
    Coins: 1
    Downloads: 0
    Published: 2021-06-02
  • Language: Python
  • Tags: lstm  nlp

Resource Description

An LSTM-based semantic similarity model; experiments were run on a Baidu QA dataset.
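The snippet below loads pre-built, padded index arrays from .npy files rather than raw text. As a rough sketch of how such arrays could be produced (the 30/300 sequence lengths, the 700,000-word cap, and the tok_expend.pkl name come from the snippet itself; the raw-text variables and example strings here are hypothetical placeholders):

import jieba
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# hypothetical raw question/answer pairs (placeholders, not from the dataset)
questions = ['怎么办理信用卡', '信用卡年费多少']
answers = ['可以在银行官网申请信用卡', '普卡一般每年收取一定年费']

# segment the Chinese text with jieba, then fit a word-index tokenizer
q_seg = [' '.join(jieba.lcut(q)) for q in questions]
a_seg = [' '.join(jieba.lcut(a)) for a in answers]
tokenizer = Tokenizer(num_words=700000)
tokenizer.fit_on_texts(q_seg + a_seg)

# pad to the lengths the model expects: 30 tokens for questions, 300 for answers
data_1 = pad_sequences(tokenizer.texts_to_sequences(q_seg), maxlen=30)
data_2 = pad_sequences(tokenizer.texts_to_sequences(a_seg), maxlen=300)

# persist the tokenizer so get_tokenizer() below can reload it
with open('tok_expend.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)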

Code Snippet and File Info

import os
import json
import h5py
import utils
import jieba
import pickle
import numpy as np
import keras.preprocessing.text

from gensim.models import Word2Vec
from sklearn.metrics import roc_curve, auc
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.layers.merge import concatenate
from keras.layers.wrappers import Bidirectional
from keras.models import Model, model_from_json, load_model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
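
# note: the import paths above (keras.layers.merge, keras.layers.wrappers,
# keras.layers.normalization) and the word2vec.wv.vocab lookup below only
# exist in standalone Keras 2.x and gensim < 4.0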

def get_tokenizer():
    with open('tok_expend.pkl', 'rb') as f:
        tokenizer = pickle.load(f)

    word_index = tokenizer.word_index
    # cap the vocabulary at 700,000 entries
    nb_words = min(700000, len(word_index) + 1)
    return nb_words, word_index

def get_data():

    # pre-tokenized, padded sequence arrays: training split (*_s) and test split (*_t)
    data_1 = np.load('E:/2018泰迪杯/LSTM/data/data_1_s.npy')
    data_2 = np.load('E:/2018泰迪杯/LSTM/data/data_2_s.npy')
    labels = np.load('E:/2018泰迪杯/LSTM/data/labels_s.npy')
    data_t_1 = np.load('E:/2018泰迪杯/LSTM/data/data_t_1.npy')
    data_t_2 = np.load('E:/2018泰迪杯/LSTM/data/data_t_2.npy')
    labels_t = np.load('E:/2018泰迪杯/LSTM/data/labels_t.npy')
    return data_1, data_2, labels, data_t_1, data_t_2, labels_t

def train():
    num_lstm = 175
    num_dense = 100
    rate_drop_lstm = 0.15
    rate_drop_dense = 0.15
    EMBEDDING_DIM = 100
    VALIDATION_SPLIT = 0.1
    act = 'relu'

    print('get_data')
    nb_words, word_index = get_tokenizer()
    data_1, data_2, labels, data_t_1, data_t_2, labels_t = get_data()

    print('get_model')
    word2vec = Word2Vec.load('E:/2018泰迪杯/数据/w2v_expend.mod')
    # build a frozen embedding matrix from the pretrained word2vec vectors
    embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        # only indices inside the capped vocabulary get a pretrained vector
        if i < nb_words and word in word2vec.wv.vocab:
            embedding_matrix[i] = word2vec.wv.word_vec(word)

    # two frozen embedding layers sharing the pretrained weights,
    # one per input length (30-token questions, 300-token answers)
    embedding_layer_1 = Embedding(nb_words,
                                  EMBEDDING_DIM,
                                  weights=[embedding_matrix],
                                  input_length=30,
                                  trainable=False)
    embedding_layer_2 = Embedding(nb_words,
                                  EMBEDDING_DIM,
                                  weights=[embedding_matrix],
                                  input_length=300,
                                  trainable=False)
    # a single LSTM encoder shared by both inputs (siamese-style)
    lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

    sequence_1_input = Input(shape=(30,), dtype='int32')
    embedded_sequences_1 = embedding_layer_1(sequence_1_input)
    y1 = lstm_layer(embedded_sequences_1)

    sequence_2_input = Input(shape=(300,), dtype='int32')
    embedded_sequences_2 = embedding_layer_2(sequence_2_input)
    y2 = lstm_layer(embedded_sequences_2)

    merged = concatenate([y1, y2])
    merged = Dropout(rate_drop_dense)(merged)
    # (the listing's snippet breaks off here, mid-statement)
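    # --- editorial sketch: a plausible completion, inferred from the imports
    # --- (BatchNormalization, EarlyStopping, ModelCheckpoint, roc_curve, auc)
    # --- and the hyperparameters defined at the top of train(); the original
    # --- file's actual continuation is not shown in the listing
    merged = BatchNormalization()(merged)
    merged = Dense(num_dense, activation=act)(merged)
    merged = Dropout(rate_drop_dense)(merged)
    merged = BatchNormalization()(merged)
    preds = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])

    # train with early stopping; 'lstm_sim.h5' is a hypothetical checkpoint name
    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    checkpoint = ModelCheckpoint('lstm_sim.h5', save_best_only=True)
    model.fit([data_1, data_2], labels,
              validation_split=VALIDATION_SPLIT,
              epochs=20, batch_size=256,
              callbacks=[early_stopping, checkpoint])

    # score the held-out pairs and report ROC AUC, matching the sklearn imports
    preds_t = model.predict([data_t_1, data_t_2]).ravel()
    fpr, tpr, _ = roc_curve(labels_t, preds_t)
    print('test AUC:', auc(fpr, tpr))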
