• 大小: 5.84MB
    文件类型: .zip
    金币: 2
    下载: 1 次
    发布日期: 2023-09-26
  • 语言: Python
  • 标签: NLP  

资源简介

使用唐诗语料库,经过去噪预处理、分词、生成搭配、生成主题等过程,生成唐诗。基于Python。

资源截图

代码片段和文件信息

# -*- coding: utf-8 -*-

import os
import math
import time
import jieba
import codecs
import pickle
import random
import argparse

# Timestamp pattern using strftime-style directives.
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

# Absolute directory containing this script; every data path is built from it.
base_FOLDER = os.path.abspath(os.path.dirname(__file__))
if isinstance(base_FOLDER, bytes):
    # Python 2 yields a byte string here; decode it with the same GB2312
    # codec the script has always used. Python 3 already returns text, and
    # calling .decode() on it would raise AttributeError.
    base_FOLDER = base_FOLDER.decode('gb2312')

# Default locations of the data files, all under <script dir>/data.
DATA_FOLDER = os.path.join(base_FOLDER, 'data')
DEFAULT_FCOLLOCATIONS_V = os.path.join(DATA_FOLDER, 'collocations_v')
DEFAULT_FCOLLOCATIONS_H = os.path.join(DATA_FOLDER, 'collocations_h')
DEFAULT_FWORDS = os.path.join(DATA_FOLDER, 'words')
DEFAULT_FTOPIC_WORDS = os.path.join(DATA_FOLDER, 'topic_words')
DEFAULT_FSTART_WORDS = os.path.join(DATA_FOLDER, 'start_words.txt')

# Constant added to log(score) for every transition scored in
# generate_first_sentence.
LOG_DELTA = 20

def read_dump(fin):
    """Load and return the pickled object stored in the file at *fin*.

    The file is opened in binary mode and is closed even when unpickling
    fails. A confirmation line is printed after a successful read.

    Args:
        fin: Path of the pickle file to read.

    Returns:
        The object reconstructed by ``pickle.load``.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only use this
    # on dump files produced by this project, never on untrusted input.
    with open(fin, 'rb') as fd:
        data = pickle.load(fd)
    print(u'Read from {} done.'.format(fin))
    return data


def read_txt(fin):
    """Read a UTF-8 text file and return its lines with whitespace stripped.

    Every line is kept, including blank ones, which become empty strings.
    The file is closed even when decoding fails. A confirmation line is
    printed after a successful read.

    Args:
        fin: Path of the UTF-8 encoded text file to read.

    Returns:
        A list with one stripped string per line of the file.
    """
    with codecs.open(fin, 'r', 'utf-8') as fd:
        data = [line.strip() for line in fd]
    print(u'Read from {} done.'.format(fin))
    return data


def generate_first_sentence_brute_force(start_word, sentence_len, topic_vector, words):
    """Build a sentence by appending randomly chosen words to *start_word*.

    A word qualifies when it is one or two characters long and its topic
    score (``topic_vector`` at the word's first position in ``words``) is
    not below a small threshold. Qualifying words are drawn uniformly at
    random until the sentence holds ``sentence_len`` characters.

    Args:
        start_word: Word placed first in the sentence.
        sentence_len: Target number of characters in the sentence.
        topic_vector: Scores indexed in parallel with ``words``.
        words: Vocabulary to draw from; entries must be hashable strings.

    Returns:
        A list of words beginning with ``start_word``. Its total character
        count equals ``sentence_len`` whenever a qualifying word fits the
        remaining space. If ``start_word`` already fills the sentence, only
        ``[start_word]`` is returned.

    Raises:
        ValueError: If more characters are needed but no word qualifies.
    """
    min_topic_score = 1e-7
    sentence = [start_word]
    length = len(start_word)
    if length >= sentence_len:
        return sentence

    # Map each word to its first position once, replacing a linear
    # words.index() scan per draw while keeping "first occurrence" semantics.
    first_index = {}
    for position, word in enumerate(words):
        first_index.setdefault(word, position)

    # Duplicates in `words` are kept so that drawing from this list has the
    # same weighting as drawing from `words` and rejecting bad picks.
    candidates = []
    for word in words:
        if not word or len(word) > 2:
            continue
        if topic_vector[first_index[word]] < min_topic_score:
            continue
        candidates.append(word)
    if not candidates:
        # Rejection sampling would spin forever here; fail loudly instead.
        raise ValueError('no candidate word satisfies the length and topic constraints')

    single_chars = [word for word in candidates if len(word) == 1]
    while length < sentence_len:
        if sentence_len - length >= 2:
            pool = candidates
        else:
            # Only one character of room left: avoid overshooting the target
            # length. Fall back to the full pool (best effort, one character
            # too long) only when no single-character word qualifies.
            pool = single_chars or candidates
        next_word = random.choice(pool)
        sentence.append(next_word)
        length += len(next_word)
    return sentence


# topic_words max: 3.1424917513724737
# topic_words min: 0.0
def generate_first_sentence(start_word sentence_len topic_vector words collocations_h):
    f = [dict() for i in range(sentence_len + 1)]
    p = [dict() for i in range(sentence_len + 1)]
    start_len = len(start_word)
    f[start_len][start_word] = topic_vector[words.index(start_word)] if start_word in words else 0
    p[start_len][start_word] = ‘‘
    for i in range(start_len sentence_len):
        for j in f[i]:
            if j not in collocations_h:
                continue
            topic_score = topic_vector[words.index(j)] if j in words else 0
            for k in collocations_h[j]:
                if i + k <= sentence_len:
                    for test_count in range(2):
                        (score w2) = random.choice(collocations_h[j][k])
                        temp = f[i][j] + math.log(score) + LOG_DELTA + topic_score
                        if w2 not in f[i + k] or temp > f[i + k][w2]:
                            f[i + k][w2] = temp
                            p[i + k][w2] = j
    ans = 0
    last_word = ‘‘
    if not f[sentence_len]:
        return generate_first_sentence_brute_force(start_word sentence_len topic_vector words)
    for j in f[sentence_len]:
        if f[sentence_len][j] > ans:
            ans = f[sentence_len][j]
            last_word = j
    if ans == 0:
        return generate_first_sentence_brute_force(start_word sentence_len topic_vector words)
    sentence = []
    i = sentence_len
    while i > 0:
        sentence.append(last_word)
        last_word i = p[i][last_word] i - len(last_word)
    return se

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2015-12-20 08:24  poem_generator\
     文件        2057  2015-12-20 08:24  poem_generator\preprocess.py
     文件        4130  2015-12-20 08:24  poem_generator\index.py
     文件        3576  2015-12-20 08:24  poem_generator\get_topic.py
     文件        2491  2015-12-20 08:24  poem_generator\get_start_words.py
     文件        4879  2015-12-20 08:24  poem_generator\get_collocations.py
     文件       10261  2015-12-20 08:24  poem_generator\generate_poem.py
     文件         465  2017-05-12 08:35  poem_generator\README.md
     文件       11358  2015-12-20 08:24  poem_generator\LICENSE
     文件         702  2015-12-20 08:24  poem_generator\.gitignore
     目录           0  2015-12-20 08:24  poem_generator\templates\
     文件        2285  2015-12-20 08:24  poem_generator\templates\index.htm
     目录           0  2015-12-20 08:24  poem_generator\data\
     文件    15597926  2015-12-20 08:24  poem_generator\data\唐诗语料库.txt

评论

共有 条评论