资源简介
使用 BiLSTM-CRF 分词模型,在 SIGHAN Microsoft Research(MSR)数据集上进行中文分词的训练和测试。
运行方法见 readme.md,另附中文、英文两份详细报告。
【源码目录】
中山大学_中文分词
├── readme.md
├── 代码
│ ├── cws.py
│ ├── msr_test.utf8
│ ├── msr_test_gold.utf8
│ ├── msr_training.utf8
│ ├── result.txt
│ ├── test_score.py
│ ├── train.py
│ └── train_result.pkl
├── 中文报告.docx
└── 英文报告.docx
1 directory, 11 files
代码片段和文件信息
import codecs torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from time import sleep
torch.cuda.set_device(1) # hard-coded CUDA device index; set it directly to match the server in use
def text(filename, data):
    """Write every item of *data* to *filename*, one item per line.

    The file is opened in ``'w'`` mode, so any existing content is
    replaced. No encoding is passed to ``open``, so the platform default
    encoding is used.

    Args:
        filename: Path of the file to (over)write.
        data: Iterable of strings; a newline is appended to each item.

    Raises:
        TypeError: If an item cannot be concatenated with ``'\\n'``.
        OSError: If the file cannot be opened or written.
    """
    # The context manager closes the handle even when a write fails midway.
    with open(filename, 'w') as out_file:
        for item in data:
            out_file.write(item + '\n')
class BiLSTM_CRF(nn.Module):
def __init__(self vocab_size tag_to_ix embedding_dim hidden_dim):
super(BiLSTM_CRF self).__init__()
self.embedding_dim = embedding_dim
self.hidden_dim = hidden_dim
self.vocab_size = vocab_size
self.tag_to_ix = tag_to_ix
self.tagset_size = len(tag_to_ix)
self.word_embeds = nn.embedding(vocab_size embedding_dim)
self.lstm = nn.LSTM(embedding_dim hidden_dim // 2 num_layers=1 bidirectional=True batch_first=True)
self.hidden2tag = nn.Linear(hidden_dim self.tagset_size)
self.transitions = nn.Parameter(torch.randn(self.tagset_size self.tagset_size))
self.transitions.data[tag_to_ix[START_TAG] :] = -10000
self.transitions.data[: tag_to_ix[STOP_TAG]] = -10000
self.hidden = self.init_hidden()
def init_hidden(self):
return (torch.randn(2 1 self.hidden_dim // 2).to(device)
torch.randn(2 1 self.hidden_dim // 2).to(device))
def _get_lstm_features_test(self sentence):
self.hidden = self.init_hidden()
embeds = self.word_embeds(sentence).unsqueeze(dim=0)
lstm_out self.hidden = self.lstm(embeds)
lstm_out = lstm_out.squeeze()
lstm_feats = self.hidden2tag(lstm_out)
if len(sentence) == 1: lstm_feats = lstm_feats.unsqueeze(0)
return lstm_feats
def _forward_alg(self feats):
init_alphas = torch.full([feats.shape[0] self.tagset_size] -10000.)
init_alphas[: self.tag_to_ix[START_TAG]] = 0.
forward_var_list = []
forward_var_list.append(init_alphas)
for feat_index in range(feats.shape[1]):
gamar_r_l = torch.stack([forward_var_list[feat_index]] * feats.shape[2]).transpose(0 1)
t_r1_k = torch.unsqueeze(feats[: feat_index :] 1).transpose(1 2)
aa = gamar_r_l.to(device) + t_r1_k.to(device) + torch.unsqueeze(self.transitions 0)
forward_var_list.append(torch.logsumexp(aa dim=2))
terminal_var = forward_var_list[-1] + self.transitions[self.tag_to_ix[STOP_TAG]].repeat(
[feats.shape[0] 1])
alpha = torch.logsumexp(terminal_var dim=1)
return alpha
def _get_lstm_features(self sentence):
self.hidden = self.init_hidden()
embeds = self.word_embeds(sentence)
lstm_out self.hidden = self.lstm(embeds)
lstm_feats = self.hidden2tag(lstm_out)
return lstm_feats
def _score_sentence(self feats tags):
score = torch.zeros(tags.shape[0]).to(device)
tags = torch.cat([torch.full([tags.shape[0] 1] self.tag_to_ix[START_TAG]).
相关资源
- python合并多个mp4视频文件成一个mp4文
- Python其它开发工具的安装与使用.ppt
- Computer Vision with Python 3
- python入门全套PPT
- a*算法的python版
- python爬虫爬取微博热搜
- python爬虫爬取旅游信息(附源码,c
- python爬虫爬取豆瓣电影信息
- 网页遥控小车 Python web (基于RPi.GPI
- python 采集360的图片地址到文本文件
- Python简单小游戏 五子棋
- python基础题库(附答案).docx(共54页
- Python RC4算法
- python实现的日历
- Python源代码:以web方式管理自己的常
- 基于Python实现的简易画气球
- python 井字棋 游戏源码
- 《Python从小白到大牛》源代码
- Python爬虫实战入门教程
- 机器学习numpy和pandas基础
- Python 3 Web Development. Beginners Guide
- Python爬取小说
- 网络爬虫(pachong_anjuke.py)
- python demo百度文库.py
- Python总结(精简).doc
- Supervised Learning with Python
- python实现一个简单的名片管理系统功
- Python源码剖析.pdf59505
- python语言实现的基于opencv的表针识别
- 基于Python的酒店管理系统
评论
共有 条评论