资源简介
使用唐诗语料库,经过去噪预处理、分词、生成搭配、生成主题等过程,生成唐诗。基于Python。
代码片段和文件信息
# -*- coding: utf-8 -*-
import os
import math
import time
import jieba
import codecs
import pickle
import random
import argparse
# Timestamp format used for progress/log messages.
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# Folder containing this script; all data files live in its "data" subfolder.
# NOTE(review): the original called .decode('gb2312') on the path (a Python 2
# idiom for non-ASCII paths); on Python 3 the path is already str, so dropped.
base_FOLDER = os.path.abspath(os.path.dirname(__file__))
DATA_FOLDER = os.path.join(base_FOLDER, 'data')
# Default locations of the pickled model files produced by preprocessing.
DEFAULT_FCOLLOCATIONS_V = os.path.join(DATA_FOLDER, 'collocations_v')
DEFAULT_FCOLLOCATIONS_H = os.path.join(DATA_FOLDER, 'collocations_h')
DEFAULT_FWORDS = os.path.join(DATA_FOLDER, 'words')
DEFAULT_FTOPIC_WORDS = os.path.join(DATA_FOLDER, 'topic_words')
DEFAULT_FSTART_WORDS = os.path.join(DATA_FOLDER, 'start_words.txt')
# Additive constant keeping log(score) terms positive in the DP search.
LOG_DELTA = 20
def read_dump(fin):
    """Load and return a pickled object from file *fin*.

    NOTE(review): pickle.load on untrusted files can execute arbitrary code;
    only use with the project's own data dumps.
    """
    # 'with' guarantees the handle is closed even if unpickling raises;
    # plain open() replaces codecs.open(), which adds nothing in binary mode.
    with open(fin, 'rb') as fd:
        data = pickle.load(fd)
    print(u'Read from {} done.'.format(fin))
    return data
def read_txt(fin):
    """Read UTF-8 text file *fin* and return a list of its stripped lines."""
    # 'with' guarantees the handle is closed; the built-in open() with an
    # encoding argument replaces the legacy codecs.open() call.
    with open(fin, 'r', encoding='utf-8') as fd:
        data = [line.strip() for line in fd]
    print(u'Read from {} done.'.format(fin))
    return data
def generate_first_sentence_brute_force(start_word, sentence_len, topic_vector, words):
    """Extend *start_word* to exactly *sentence_len* characters by random sampling.

    Candidate words are drawn uniformly from *words*; a candidate is rejected
    when it is longer than two characters, would overshoot the target length,
    or its topic weight is below a small threshold.  Returns the sentence as
    a list of words.

    NOTE(review): loops forever if no acceptable word exists (e.g. no 1-char
    word to fill a final gap) -- preserved from the original design.
    """
    sentence = [start_word]
    length = len(start_word)
    threshold = 1e-7  # minimal topic relevance ("avg" in the original)
    while length < sentence_len:
        candidate = random.choice(words)
        # Fix: also reject words that would push the sentence past
        # sentence_len (the original could overshoot by one character).
        if len(candidate) > 2 or length + len(candidate) > sentence_len:
            continue
        # words.index() is O(n); acceptable for the vocabulary sizes used here.
        if topic_vector[words.index(candidate)] < threshold:
            continue
        sentence.append(candidate)
        length += len(candidate)
    return sentence
# topic_words max: 3.1424917513724737
# topic_words min: 0.0
def generate_first_sentence(start_word sentence_len topic_vector words collocations_h):
f = [dict() for i in range(sentence_len + 1)]
p = [dict() for i in range(sentence_len + 1)]
start_len = len(start_word)
f[start_len][start_word] = topic_vector[words.index(start_word)] if start_word in words else 0
p[start_len][start_word] = ‘‘
for i in range(start_len sentence_len):
for j in f[i]:
if j not in collocations_h:
continue
topic_score = topic_vector[words.index(j)] if j in words else 0
for k in collocations_h[j]:
if i + k <= sentence_len:
for test_count in range(2):
(score w2) = random.choice(collocations_h[j][k])
temp = f[i][j] + math.log(score) + LOG_DELTA + topic_score
if w2 not in f[i + k] or temp > f[i + k][w2]:
f[i + k][w2] = temp
p[i + k][w2] = j
ans = 0
last_word = ‘‘
if not f[sentence_len]:
return generate_first_sentence_brute_force(start_word sentence_len topic_vector words)
for j in f[sentence_len]:
if f[sentence_len][j] > ans:
ans = f[sentence_len][j]
last_word = j
if ans == 0:
return generate_first_sentence_brute_force(start_word sentence_len topic_vector words)
sentence = []
i = sentence_len
while i > 0:
sentence.append(last_word)
last_word i = p[i][last_word] i - len(last_word)
return se
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2015-12-20 08:24 poem_generator\
文件 2057 2015-12-20 08:24 poem_generator\preprocess.py
文件 4130 2015-12-20 08:24 poem_generator\index.py
文件 3576 2015-12-20 08:24 poem_generator\get_topic.py
文件 2491 2015-12-20 08:24 poem_generator\get_start_words.py
文件 4879 2015-12-20 08:24 poem_generator\get_collocations.py
文件 10261 2015-12-20 08:24 poem_generator\generate_poem.py
文件 465 2017-05-12 08:35 poem_generator\README.md
文件 11358 2015-12-20 08:24 poem_generator\LICENSE
文件 702 2015-12-20 08:24 poem_generator\.gitignore
目录 0 2015-12-20 08:24 poem_generator\templates\
文件 2285 2015-12-20 08:24 poem_generator\templates\index.htm
目录 0 2015-12-20 08:24 poem_generator\data\
文件 15597926 2015-12-20 08:24 poem_generator\data\唐诗语料库.txt
相关资源
- 微博用户评论情感分析python代码数据
- Deep Learning for Natural Language Processing
- Python-神经网络模型能够从音频演讲中
- metadata.txt - Python-NLP之旅包含NLP文章代码集锦
- Python-20182019校招春招秋招算法NLP深度
- python snownlp-0.12.3.tar.gz
- NLP实战之fasttext进行THUCNews文本分类
- python 自然语言处理实战代码部分
- DNN判断句子的通顺程度.py
- nlp肯定句与否定句判断
- NLPIR.user
- nltk语料库
- 基于lstm的语义相似度计算模型代码
- textrank自动文摘抽取python代码
- RasaCore官方文档中文版
- 2019传智播客python零基础入门视频教程
- NLP分词
- 基于SnowNLP的豆瓣评论情感分析及词云
- 正向最大匹配分词算法及KNN文本分类
- Image Caption 看图说话python代码
评论
共有 0 条评论