资源简介
其中 thulac 用于分词,gensim 用于 word2vec。这两个库只用于第一篇文章的分析;如果您只关心如何构建诗人关系网络,那么不需要安装这两个库。
代码片段和文件信息
import pickle
import argparse
import os
from collections import Counter defaultdict
from utils import read_qts get_alter_names_from_CBDB
# TODO: extend the list of well-known poets.
# These poets have namesakes in CBDB that cannot easily be ruled out, so their
# IDs in the BIOG_MAIN table were looked up by hand.
# Note that CBDB uses Traditional Chinese.
manual_defuzzy_authors_id = {
    '李林甫': 32534,
    '王建': 92047,
    '李賀': 93012,
    '張繼': 93495,
    '張旭': 93409,
    '李紳': 92982,
}
# Authors that are removed by hand.
mannual_deleted_authors = {'無作', '清江'}
# Alternative names that are removed by hand for specific authors: these
# aliases are commonly used words in Tang poetry.
mannual_deleted_alter_names = {
    '李林甫': {'李十'},
    '李益': {'李十'},
    '李世民': {'李二'},
    '李嘉祐': {'李二'},
    '馬湘': {'自然'},
    '高駢': {'千里'},
    '孟浩然': {'浩然'},
    '李白': {'太白'},
    '黃巢': {'皇帝'},
    '眉娘': {'逍遙'},
}
# Alternative names that are missing from CBDB and are added by hand.
mannual_added_alter_names = {
    '李建': {'李十一'},
    '劉禹錫': {'劉二十八'},
}
def get_alter_names(qts_file, cbdb_file, save_dir):
    """Return the poems, the CBDB-filtered authors and their alternative names.

    The result is cached as ``alternames.pkl`` inside *save_dir*. When that
    file already exists it is loaded and returned directly; otherwise the data
    is computed from *qts_file* and *cbdb_file*, adjusted with the manual
    override tables defined in this module, and then written to the cache.

    Args:
        qts_file: Forwarded to ``read_qts``.
        cbdb_file: Forwarded to ``get_alter_names_from_CBDB``.
        save_dir: Directory in which the cache file is read or written.

    Returns:
        A tuple ``(qts_list, authors_filtered_by_CBDB, alter_names_dict)``.
    """
    alter_names_file = os.path.join(save_dir, "alternames.pkl")
    if os.path.exists(alter_names_file):
        print("find dumped alternames file loading directly.")
        # NOTE(review): unpickling can execute arbitrary code; only point
        # save_dir at cache files this program wrote itself.
        with open(alter_names_file, 'rb') as f:
            qts_list, authors_filtered_by_CBDB, alter_names_dict = pickle.load(f)
    else:
        print("processing QuanTangShi...")
        # Read the poems and collect the set of their authors.
        qts_list, authors_set = read_qts(qts_file)
        # Drop the authors that were excluded by hand.
        authors_set -= mannual_deleted_authors
        alter_names_dict, authors_filtered_by_CBDB = get_alter_names_from_CBDB(
            cbdb_file, authors_set, manual_defuzzy_authors_id)
        # Remove unwanted alternative names.
        # NOTE(review): presumably every overridden author has an entry in
        # alter_names_dict (or it is a defaultdict) — confirm in utils.
        for k, v in mannual_deleted_alter_names.items():
            alter_names_dict[k] -= v
        # Add the alternative names that CBDB lacks.
        for k, v in mannual_added_alter_names.items():
            alter_names_dict[k] |= v
        # Persist the computed result for later runs.
        with open(alter_names_file, 'wb') as f:
            pickle.dump([qts_list, authors_filtered_by_CBDB, alter_names_dict], f)
    return qts_list, authors_filtered_by_CBDB, alter_names_dict
def get_refer_relations(qts_list authors_filtered_by_CBDB alter_names_dict save_dir):
reference_relations_file = os.path.join(save_dir ‘reference_relations.pkl‘)
if os.path.exists(reference_relations_file):
print(“find dumped reference relations file skip calculating.“)
return
else:
print(“calculating reference relations...“)
reference_relations_counter = Counter()
reference_relations_text = defaultdict(list)
# 逐个作者搜寻
for name in authors_filtered_by_CBDB:
# 逐首诗搜寻
for author title text in qts_list:
# 如果不在CBDB过滤过的set中,直接跳过
if author not in authors_filtered_by_CBDB:
continue
poem = title + ‘ ‘ + text
# 查找本名,标题加正文中只要出现一次名字就可以
if poem.find(name) != -1:
reference_relations_counter[(author name)] += 1
reference_relations_text[(author name)].append(title)
continue
# 查找别名
alt_names = alter_names_dict[name]
for alt_name in alt_names:
if poem.find(alt_name) != -1:
reference_relations_counter[(author name)] += 1
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2017-04-01 13:04 poetry_analyzer-master\
文件 118 2017-04-01 13:04 poetry_analyzer-master\.gitignore
文件 2912 2017-04-01 13:04 poetry_analyzer-master\README.md
文件 4562 2017-04-01 13:04 poetry_analyzer-master\construct_poets_network.py
目录 0 2017-04-01 13:04 poetry_analyzer-master\data\
文件 206 2017-04-01 13:04 poetry_analyzer-master\data\early_tang_poets.txt
文件 393 2017-04-01 13:04 poetry_analyzer-master\data\high_tang_poets.txt
文件 390 2017-04-01 13:04 poetry_analyzer-master\data\late_tang_poets.txt
文件 418 2017-04-01 13:04 poetry_analyzer-master\data\middle_tang_poets.txt
文件 10877380 2017-04-01 13:04 poetry_analyzer-master\data\qts_zhs.txt
文件 10877377 2017-04-01 13:04 poetry_analyzer-master\data\qts_zht.txt
目录 0 2017-04-01 13:04 poetry_analyzer-master\html\
文件 3456 2017-04-01 13:04 poetry_analyzer-master\html\early_tang_poets_net.html
文件 577370 2017-04-01 13:04 poetry_analyzer-master\html\echarts-all-3.js
文件 10836 2017-04-01 13:04 poetry_analyzer-master\html\full_tang_poets_net.html
文件 5476 2017-04-01 13:04 poetry_analyzer-master\html\high_tang_poets_net.html
文件 962 2017-04-01 13:04 poetry_analyzer-master\html\html_head.txt
文件 405 2017-04-01 13:04 poetry_analyzer-master\html\html_tail.txt
文件 4879 2017-04-01 13:04 poetry_analyzer-master\html\late_tang_poets_net.html
文件 6619 2017-04-01 13:04 poetry_analyzer-master\html\middle_tang_poets_net.html
文件 3682 2017-04-01 13:04 poetry_analyzer-master\utils.py
文件 5385 2017-04-01 13:04 poetry_analyzer-master\visualize_poets_network.py
文件 5311 2017-04-01 13:04 poetry_analyzer-master\word_level_analyzer.py
评论
共有 条评论