Resource Overview
Document summarization methods ranging from basic statistical techniques to recent approaches based on deep learning and reinforcement learning, together with performance-optimization strategies. (Open-source code attached.)
Code Snippet and File Information
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import math
from itertools import product, count
from string import punctuation
from heapq import nlargest

stopwords = set(stopwords.words('english') + list(punctuation))

def calculate_similarity(sen1, sen2):
    counter = 0
    for word in sen1:
        if word in sen2:
            counter += 1
    return counter / (math.log(len(sen1) + 1) + math.log(len(sen2) + 1) + 1)
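
# Note: calculate_similarity above is the TextRank sentence-similarity measure,
# i.e. the count of tokens of S_i that also occur in S_j, normalised by the log
# sentence lengths; the +1 terms are smoothing so that very short sentences
# cannot produce a zero denominator:
#
#   sim(S_i, S_j) = #{ w in S_i : w in S_j } / ( log(|S_i|+1) + log(|S_j|+1) + 1 )
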
def create_graph(word_sent):
    num = len(word_sent)
    board = [[0.0 for _ in range(num)] for _ in range(num)]
    for i, j in product(range(num), repeat=2):
        if i != j:
            board[i][j] = calculate_similarity(word_sent[i], word_sent[j])
    return board

def Summarize(text, n):
    # First split the text into sentences, then tokenize each sentence.
    sents = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sents]
    # Remove stopwords and punctuation. Rebuild each token list instead of
    # calling remove() while iterating, which would skip consecutive stopwords.
    for i in range(len(word_sent)):
        word_sent[i] = [word for word in word_sent[i] if word not in stopwords]
    similarity_graph = create_graph(word_sent)
    scores = weighted_pagerank(similarity_graph)
    # Select the n highest-scoring (score, index) pairs.
    sent_selected = nlargest(n, zip(scores, count()))
    sent_index = []
    for i in range(n):
        sent_index.append(sent_selected[i][1])
    return [sents[i] for i in sent_index]

def different(scores, old_scores):
    flag = False
    for i in range(len(scores)):
        if math.fabs(scores[i] - old_scores[i]) >= 0.0001:
            flag = True
            break
    return flag

def weighted_pagerank(weight_graph):
    # Initialize every sentence score to 0.5.
    scores = [0.5 for _ in range(len(weight_graph))]
    old_scores = [0.0 for _ in range(len(weight_graph))]
    # Iterate until no score changes by more than the threshold in different().
    while different(scores, old_scores):
        for i in range(len(weight_graph)):
            old_scores[i] = scores[i]
        for i in range(len(weight_graph)):
            scores[i] = calculate_score(weight_graph, scores, i)
    return scores

def calculate_score(weight_graph, scores, i):
    length = len(weight_graph)
    d = 0.85  # damping factor
    added_score = 0.0
    for j in range(length):
        # Numerator: weight of the edge j -> i times sentence j's current score.
        fraction = weight_graph[j][i] * scores[j]
        # Denominator: total outgoing weight of sentence j (+1.0 to avoid /0).
        denominator = 0.0
        for k in range(length):
            denominator += weight_graph[j][k]
        added_score += fraction / (denominator + 1.0)
    weighted_score = (1 - d) + d * added_score
    return weighted_score
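
# Taken together, calculate_score applies the damped, weighted PageRank update
# that TextRank uses for sentence ranking (damping factor d = 0.85), with an
# extra +1.0 on each out-weight sum as a guard against division by zero:
#
#   WS(V_i) = (1 - d) + d * sum_j [ w_ji / (1 + sum_k w_jk) ] * WS(V_j)
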
if __name__ == '__main__':
    file_output = r"predtextank.txt"
    output = open(file_output, "w", encoding="utf8")
    with open("srcT-val.txt", "r", encoding="utf8") as myfile:
        text = myfile.readlines()
    for t in text:
        t = t.replace('\n', '')
        summary = Summarize(t, 1)
        line = " ".join(summary) + "\n"
        output.write(line)
    output.close()

    from snownlp import SnowNLP
    with open(r"D:\Tensorflow\nlp\bytedance\srcT-val.txt", "r", encoding="u
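
For reference, a minimal usage sketch of the Summarize function defined above. It is an illustration only, not part of the archive: it assumes NLTK's punkt and stopwords data have already been downloaded, and the sample text is made up.

# One-time setup (assumption): nltk.download('punkt'); nltk.download('stopwords')
sample = ("Text summarization produces a condensed version of a document. "
          "Extractive approaches select the most representative sentences. "
          "TextRank scores sentences with a graph-based ranking algorithm.")
print(Summarize(sample, 2))  # prints the two highest-scoring sentences
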
Type        Size      Date       Time  Name
----        --------- ---------- ----- ----
Dir                 0 2019-01-16 18:22 文档摘要解决方案\
File           460182 2019-01-16 18:21 文档摘要解决方案\A DEEP REINFORCED MODEL FOR ABSTRACTIVE SUMMARIZATION.pdf
File            55446 2018-12-24 19:44 文档摘要解决方案\A-DEEP-REINFORCED-MODEL-FOR-ABSTRACTIVE-SUMMARIZATION-master.zip
File           391109 2019-01-16 18:17 文档摘要解决方案\Abstractive Text Summarization using Sequence-to-sequence RNNs and Beyond.pdf
File           442103 2019-01-16 18:16 文档摘要解决方案\Get To The Point Summarization with Pointer-Generator Networks.pdf
File           987202 2018-10-15 22:22 文档摘要解决方案\Neural Response Generation with Dynamic Vocabularies.pdf
File             3631 2019-01-16 18:08 文档摘要解决方案\NewsSummary.py
File           784434 2019-01-16 18:04 文档摘要解决方案\NON-AUTOREGRESSIVE NEURAL MACHINE TRANSLATION.pdf
File           161978 2019-01-16 18:09 文档摘要解决方案\OpenNMT Neural Machine Translation Toolkit.pdf
File              417 2019-01-16 18:22 文档摘要解决方案\readme.txt
File           769595 2019-01-16 18:17 文档摘要解决方案\Sequential Copying Networks.pdf
File          1815148 2018-12-28 19:43 文档摘要解决方案\srcT-val.txt