资源简介
基于深度学习的文本相似度计算模型和代码,亲自跑过,可以直接使用,对 NLP 领域的学习非常有借鉴意义,在智能问答系统中经常会用到。
代码片段和文件信息
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Author:yanqiang
@File: build_input.py
@Time: 2018/11/30 17:41
@Software: PyCharm
@Description: Build the model inputs
"""
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from gensim.models import Word2Vec
# train = load_atec()
# train, dev, test = load_ccks()
def select_best_length(train, limit_ratio=0.95):
    """Pick a padding length that covers most sentences in the dataset.

    The lengths of the sentences in both question columns are pooled, and
    the smallest length ``L`` is returned such that at least ``limit_ratio``
    of all sentences have a length of at most ``L``.

    :param train: mapping-like object (e.g. a DataFrame) whose ``'q1'`` and
        ``'q2'`` entries are iterables of sentences, walked in lockstep.
    :param limit_ratio: fraction of sentences that must fit within the
        returned length; defaults to 0.95.
    :return: the selected max length, or 0 when there are no sentences or
        the requested ratio can never be reached (``limit_ratio`` > 1).
    """
    len_list = []
    for q1, q2 in zip(train['q1'], train['q2']):
        len_list.append(len(q1))
        len_list.append(len(q2))

    if not len_list:
        # Nothing to measure; bail out instead of dividing by zero below.
        return 0

    all_sent = len(len_list)
    average_length = sum(len_list) / all_sent

    # Accumulate in ascending length order so the running share really is
    # "fraction of sentences no longer than this length". Walking the
    # lengths in frequency order (Counter.most_common) can stop on a rare,
    # short length that covers almost nothing.
    max_length = 0
    covered = 0
    for length, count in sorted(Counter(len_list).items()):
        covered += count
        # Integer counts avoid the rounding drift of summing per-length rates.
        if covered / all_sent >= limit_ratio:
            max_length = length
            break

    print('average_length:', average_length)
    print('max_length:', max_length)
    return max_length
# select_best_length()
#返回train_xy
def build_data(train):
    """Build character-level samples and the vocabulary from a dataset.

    Every sentence in ``train.q1`` and ``train.q2`` is split into its
    individual characters. The vocabulary (every character seen plus the
    ``'UNK'`` token) is written, one token per line, to ``model/vocab.txt``.

    :param train: object exposing ``q1``, ``q2`` and ``label`` columns that
        support ``.apply`` / ``.tolist`` (presumably a pandas DataFrame).
    :return: ``(datas, word_dict)`` where ``datas`` is
        ``[[left_samples, right_samples], labels]`` and ``word_dict`` maps
        each vocabulary token to its integer index.
    """
    import os

    # Character lists for question 1 and question 2 of every sample.
    sample_x_left = train.q1.apply(lambda x: [char for char in x if char]).tolist()
    sample_x_right = train.q2.apply(lambda x: [char for char in x if char]).tolist()

    # Collect the vocabulary; 'UNK' is always present.
    vocabs = {'UNK'}
    for x_left, x_right in zip(sample_x_left, sample_x_right):
        vocabs.update(x_left)
        vocabs.update(x_right)

    sample_x = [sample_x_left, sample_x_right]
    sample_y = train.label.tolist()
    print(len(sample_x_left), len(sample_x_right))
    datas = [sample_x, sample_y]

    # Sort once so token indices are reproducible across runs (set iteration
    # order changes with string hash randomization) and so the saved file
    # is guaranteed to list tokens in index order.
    ordered_vocab = sorted(vocabs)
    word_dict = {wd: index for index, wd in enumerate(ordered_vocab)}

    vocab_path = 'model/vocab.txt'
    # Create the output directory on first use instead of failing in open().
    os.makedirs(os.path.dirname(vocab_path), exist_ok=True)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(ordered_vocab))
    return datas, word_dict
def convert_data(datas, word_dict, MAX_LENGTH):
    """Convert character samples into padded index arrays for Keras.

    :param datas: ``[[left_samples, right_samples], labels]`` where each
        sample is a sequence of tokens.
    :param word_dict: mapping from token to integer index. When it contains
        ``'UNK'``, tokens missing from the mapping are encoded with that
        index; otherwise a missing token raises ``KeyError``.
    :param MAX_LENGTH: length every sequence is padded to, passed to
        ``pad_sequences`` as ``maxlen``.
    :return: ``(left_x_train, right_x_train, y_train)``; ``y_train`` has a
        trailing axis of size 1 added to the label vector.
    """
    sample_x = datas[0]
    sample_y = datas[1]
    sample_x_left = sample_x[0]
    sample_x_right = sample_x[1]

    unk_index = word_dict.get('UNK')

    def _encode(sentence):
        # Without an 'UNK' entry keep the strict lookup (KeyError on a miss).
        if unk_index is None:
            return [word_dict[char] for char in sentence]
        return [word_dict.get(char, unk_index) for char in sentence]

    left_x_train = [_encode(data) for data in sample_x_left]
    right_x_train = [_encode(data) for data in sample_x_right]
    y_train = [int(i) for i in sample_y]

    left_x_train = pad_sequences(left_x_train, maxlen=MAX_LENGTH, padding='pre')
    right_x_train = pad_sequences(right_x_train, maxlen=MAX_LENGTH, padding='pre')
    # The labels are 1-D, so axis 2 is out of range and raises AxisError on
    # current NumPy; -1 appends the trailing axis that was intended.
    y_train = np.expand_dims(y_train, -1)
    return left_x_train, right_x_train, y_train
def train_w2v(datas):
“““
训练词向量
:return:
“““
sents = datas[0][0] + datas[0][1]
#print(sents)
model = Word2Vec(sentences=sents size=300 min_
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 6148 2020-02-26 18:01 sentence-similarity-project\.DS_Store
文件 141 2018-12-05 17:08 sentence-similarity-project\.gitignore
文件 128 2020-03-28 21:48 sentence-similarity-project\.idea\libraries\R_User_Library.xml
文件 315 2020-03-28 21:46 sentence-similarity-project\.idea\misc.xml
文件 313 2020-03-28 21:46 sentence-similarity-project\.idea\modules.xml
文件 611 2020-03-28 21:48 sentence-similarity-project\.idea\sentence-similarity-project.iml
文件 16934 2020-03-29 15:07 sentence-similarity-project\.idea\workspace.xml
文件 4702 2020-03-28 23:25 sentence-similarity-project\build_input.py
文件 1753 2020-03-05 21:25 sentence-similarity-project\data_loader.py
文件 1780 2020-03-05 21:40 sentence-similarity-project\evalute.py
文件 3485318 2020-03-05 21:23 sentence-similarity-project\input\atec\atec_nlp_sim_train.csv
文件 5625804 2018-12-05 17:08 sentence-similarity-project\input\atec\atec_nlp_sim_train_add.csv
文件 946 2018-12-05 17:08 sentence-similarity-project\input\atec\readme.txt
文件 609 2018-12-05 17:08 sentence-similarity-project\input\ccks\Readme
文件 760958 2018-12-05 17:08 sentence-similarity-project\input\ccks\task3_dev.txt
文件 7355965 2018-12-05 17:08 sentence-similarity-project\input\ccks\task3_train.txt
文件 8555401 2018-12-05 17:08 sentence-similarity-project\input\ccks\test_with_id.txt
文件 23854 2020-03-29 12:06 sentence-similarity-project\model\model.png
文件 29593 2020-03-29 12:27 sentence-similarity-project\model\result_atec.png
文件 25260 2018-12-05 17:08 sentence-similarity-project\model\result_ccks.png
文件 8809848 2020-03-29 12:27 sentence-similarity-project\model\tokenvec_bilstm2_siamese_model.h5
文件 7847540 2020-03-29 12:06 sentence-similarity-project\model\token_vec_300.bin
文件 10735 2020-03-29 12:06 sentence-similarity-project\model\vocab.txt
文件 4329 2020-03-29 12:06 sentence-similarity-project\train_siamese_network.py
文件 5003 2020-03-28 23:25 sentence-similarity-project\__pycache__\build_input.cpython-36.pyc
文件 1481 2020-03-28 21:48 sentence-similarity-project\__pycache__\data_loader.cpython-36.pyc
文件 175767 2020-02-26 18:00 sentence-similarity-project\文本相似度建模.pdf
目录 0 2020-03-28 21:48 sentence-similarity-project\.idea\inspectionProfiles
目录 0 2020-03-28 21:48 sentence-similarity-project\.idea\libraries
目录 0 2020-03-05 21:23 sentence-similarity-project\input\atec
............此处省略9个文件信息
相关资源
- 全功能智能车之CCD ADC触发DMA传输
- 基于YOLO神经网络的实时车辆检测代码
- AI圣经《深度学习中文版》 高清.pdf版
- eyeriss项目组的深度学习加速器的总结
- 智能arduino小车源程序
- The Book of Why: The New Science of Cause and
- 深度学习word2vector测试语料text8
- 基于STM32F4VET6开发板的智能语音识别系
- 深度学习高清中文完整版 PDF
- Scikit-Learn与TensorFlow机器学习实用指南
- 神经网络与深度学习(中文+英文原版
- 智能家居手机APP远程智能控制家居设
- 吴恩达深度学习deeplearning第五课第一
- 基于移动互联网大数据挖掘的智能精
- [免费完整版]Neural Networks Tricks of the
- 知识图谱与认知智能
- Smart Grid using Big Data Analytics A Random M
- 人工智能马少平,朱小燕
- 基于opencv的远程视频监控智能小车
- 智能无线遥控门铃设计内含原理图、
- Bishop《Pattern Recognition and Machine Learn
- 游戏开发中的人工智能(中文).pdf
- keras实现歌词的自动生成 所需的歌词
- Amlogic S905开发资料
- 智能车K60摄像头组全套代码
- 无线视频遥控的WIFI智能小车
- 百面*机器学习pdf-高清版-带书签
- The HundredPage Machine Learning Book.pdf
- 米家智能摄像机云台版固件及升级操
- 《超智能体》
评论
共有 条评论