• 大小: 37.02MB
    文件类型: .zip
    金币: 1
    下载: 0 次
    发布日期: 2023-06-28
  • 语言: Python
  • 标签:

资源简介

主要实现使用了基于字向量的四层双向LSTM与CRF模型的网络。该项目提供了原始训练数据样本(一般项目、出院情况、病史情况、病史特点、诊疗经过)与转换版本、训练脚本、预训练模型,可用于序列标注研究、把玩和PK使用。

资源截图

代码片段和文件信息

#!/usr/bin/env python3
# coding: utf-8
# File: lstm_predict.py
# Author: lhy
# Date: 18-5-23

import os

import matplotlib.pyplot as plt
import numpy as np
from keras import backend as K
from keras.layers import (
    Embedding,
    Bidirectional,
    LSTM,
    Dense,
    TimeDistributed,
    Dropout,
)
from keras.models import Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras_contrib.layers.crf import CRF

# Set TensorFlow's C++ log threshold to '2'; per the TensorFlow docs this is
# expected to hide INFO and WARNING messages (NOTE(review): confirm it still
# takes effect when set after the keras import above).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

class LSTMNER:
    def __init__(self):
        cur = ‘/‘.join(os.path.abspath(__file__).split(‘/‘)[:-1])
        self.train_path = os.path.join(cur ‘data/train.txt‘)
        self.vocab_path = os.path.join(cur ‘model/vocab.txt‘)
        self.embedding_file = os.path.join(cur ‘model/token_vec_300.bin‘)
        self.model_path = os.path.join(cur ‘model/tokenvec_bilstm2_crf_model_20.h5‘)
        self.word_dict = self.load_worddict()
        self.class_dict ={
                         ‘O‘:0
                         ‘TREATMENT-I‘: 1
                         ‘TREATMENT-B‘: 2
                         ‘BODY-B‘: 3
                         ‘BODY-I‘: 4
                         ‘SIGNS-I‘: 5
                         ‘SIGNS-B‘: 6
                         ‘CHECK-B‘: 7
                         ‘CHECK-I‘: 8
                         ‘DISEASE-I‘: 9
                         ‘DISEASE-B‘: 10
                        }
        self.label_dict = {j:i for ij in self.class_dict.items()}
        self.embedDING_DIM = 300
        self.EPOCHS = 10
        self.BATCH_SIZE = 128
        self.NUM_CLASSES = len(self.class_dict)
        self.VOCAB_SIZE = len(self.word_dict)
        self.TIME_STAMPS = 150
        self.embedding_matrix = self.build_embedding_matrix()
        self.model = self.tokenvec_bilstm2_crf_model()
        self.model.load_weights(self.model_path)

    # Load the vocabulary.
    def load_worddict(self):
        vocabs = [line.strip() for line in open(self.vocab_path)]
        word_dict = {wd: index for index wd in enumerate(vocabs)}
        return word_dict

    # Build the model input, converting the text into the required form.
    def build_input(self text):
        x = []
        for char in text:
            if char not in self.word_dict:
                char = ‘UNK‘
            x.append(self.word_dict.get(char))
        x = pad_sequences([x] self.TIME_STAMPS)
        return x

    def predict(self text):
        str = self.build_input(text)
        raw = self.model.predict(str)[0][-self.TIME_STAMPS:]
        result = [np.argmax(row) for row in raw]
        chars = [i for i in text]
        tags = [self.label_dict[i] for i in result][len(result)-len(text):]
        res = list(zip(chars tags))
        print(res)
        return res

    # Load the pre-trained word vectors.
    def load_pretrained_embedding(self):
        embeddings_dict = {}
        with open(self.embedding_file ‘r‘) as f:
            for line in f:
                values = line.strip().split(‘ ‘)
                if len(values) < 300:
                    cont

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2018-12-15 05:13  MedicalNamedEntityRecognition-master\
     目录           0  2018-12-15 05:13  MedicalNamedEntityRecognition-master\.idea\
     文件         459  2018-12-15 05:13  MedicalNamedEntityRecognition-master\.idea\illness_entity_recognize.iml
     文件         212  2018-12-15 05:13  MedicalNamedEntityRecognition-master\.idea\misc.xml
     文件         300  2018-12-15 05:13  MedicalNamedEntityRecognition-master\.idea\modules.xml
     文件       17622  2018-12-15 05:13  MedicalNamedEntityRecognition-master\.idea\workspace.xml
     文件        7140  2018-12-15 05:13  MedicalNamedEntityRecognition-master\README.md
     目录           0  2018-12-15 05:13  MedicalNamedEntityRecognition-master\data\
     文件     2022512  2018-12-15 05:13  MedicalNamedEntityRecognition-master\data\train.txt
     文件     1452349  2018-12-15 05:13  MedicalNamedEntityRecognition-master\data_origin.zip
     文件        1213  2018-12-15 05:13  MedicalNamedEntityRecognition-master\length_distribution.txt
     文件        4857  2018-12-15 05:13  MedicalNamedEntityRecognition-master\lstm_predict.py
     文件        6869  2018-12-15 05:13  MedicalNamedEntityRecognition-master\lstm_train.py
     目录           0  2018-12-15 05:13  MedicalNamedEntityRecognition-master\model\
     文件    70849895  2018-12-15 05:13  MedicalNamedEntityRecognition-master\model\token_vec_300.bin
     文件     9438552  2018-12-15 05:13  MedicalNamedEntityRecognition-master\model\tokenvec_bilstm2_crf_model_20.h5
     文件        6881  2018-12-15 05:13  MedicalNamedEntityRecognition-master\model\vocab.txt
     文件        2604  2018-12-15 05:13  MedicalNamedEntityRecognition-master\transfer_data.py

评论

共有 条评论