• 大小: 40.13MB
    文件类型: .zip
    金币: 1
    下载: 0 次
    发布日期: 2023-08-08
  • 语言: Python
  • 标签:

资源简介

一个非常简单的 BiLSTM-CRF 模型用于中文命名实体识别 (TensorFlow)

资源截图

代码片段和文件信息

import os
import pickle
import random
import sys

import numpy as np

## tags, BIO
# Maps every BIO tag string used in the corpus to its integer label id.
# "O" is the outside tag; the B-/I- pairs cover the PER, LOC and ORG tags.
tag2label = {
    "O": 0,
    "B-PER": 1, "I-PER": 2,
    "B-LOC": 3, "I-LOC": 4,
    "B-ORG": 5, "I-ORG": 6,
}


def read_corpus(corpus_path):
    """
    Read a tagged corpus file and return the list of samples.

    Every non-blank line must hold exactly two whitespace-separated fields,
    a token and its tag. A line that is exactly a newline terminates the
    current sentence.

    :param corpus_path: path of the UTF-8 encoded corpus file
    :return: data, a list of ``(tokens, tags)`` tuples, one per sentence,
        where ``tokens`` and ``tags`` are parallel lists of strings
    :raises ValueError: if a non-blank line does not split into two fields
    """
    data = []
    sent_, tag_ = [], []
    with open(corpus_path, encoding='utf-8') as fr:
        for line in fr:
            if line != '\n':
                char, label = line.strip().split()
                sent_.append(char)
                tag_.append(label)
            else:
                data.append((sent_, tag_))
                sent_, tag_ = [], []

    # A file that does not end with a blank line would otherwise lose its
    # last sentence, because sentences are only flushed on a blank line.
    if sent_:
        data.append((sent_, tag_))

    return data


def vocab_build(vocab_path, corpus_path, min_count):
    """
    Build a token-to-id vocabulary from a corpus and pickle it to disk.

    Tokens made only of digits are folded into a single number token and
    single ASCII letters into a single English token. Tokens seen fewer than
    ``min_count`` times are dropped, except for those two folded tokens.
    Surviving tokens get consecutive ids starting at 1 in first-seen order;
    the unknown token gets the next id and the padding token gets id 0.
    The vocabulary size is printed before the file is written.

    :param vocab_path: path of the pickle file to write
    :param corpus_path: path of the corpus file, read with ``read_corpus``
    :param min_count: minimum frequency a token needs to be kept
    :return: None
    """
    # NOTE(review): the special-token literals were lost in the extracted
    # source (angle-bracket text was stripped); they are restored here as
    # '<NUM>', '<ENG>', '<UNK>' and '<PAD>' -- confirm against the vocabulary
    # file consumed by the rest of the project.
    data = read_corpus(corpus_path)

    word_freq = {}
    for sent_, _tag in data:
        for word in sent_:
            if word.isdigit():
                word = '<NUM>'
            elif ('\u0041' <= word <= '\u005a') or ('\u0061' <= word <= '\u007a'):
                # U+0041..U+005A is A-Z, U+0061..U+007A is a-z.
                word = '<ENG>'
            word_freq[word] = word_freq.get(word, 0) + 1

    # Dict iteration preserves first-seen order, so ids stay deterministic.
    kept_words = [
        word for word, freq in word_freq.items()
        if freq >= min_count or word in ('<NUM>', '<ENG>')
    ]

    word2id = {word: new_id for new_id, word in enumerate(kept_words, start=1)}
    word2id['<UNK>'] = len(kept_words) + 1
    word2id['<PAD>'] = 0

    print(len(word2id))
    with open(vocab_path, 'wb') as fw:
        pickle.dump(word2id, fw)


def sentence2id(sent, word2id):
    """
    Convert a sequence of tokens into their vocabulary ids.

    Digit-only tokens are looked up as the number token, single ASCII
    letters as the English token, and anything not in ``word2id`` as the
    unknown token.

    :param sent: iterable of string tokens
    :param word2id: mapping from token to integer id
    :return: list of ids, one per token in ``sent``
    :raises KeyError: if the substituted special token is itself missing
        from ``word2id``
    """
    # NOTE(review): the special-token literals were lost in the extracted
    # source (angle-bracket text was stripped); restored as '<NUM>', '<ENG>'
    # and '<UNK>' -- confirm they match the keys of the stored vocabulary.
    sentence_id = []
    for word in sent:
        if word.isdigit():
            word = '<NUM>'
        elif ('\u0041' <= word <= '\u005a') or ('\u0061' <= word <= '\u007a'):
            # U+0041..U+005A is A-Z, U+0061..U+007A is a-z. This is a string
            # comparison, so it presumably expects single-character tokens.
            word = '<ENG>'
        if word not in word2id:
            word = '<UNK>'
        sentence_id.append(word2id[word])
    return sentence_id


def read_dictionary(vocab_path):
    """
    Load a pickled vocabulary from disk and print its size.

    :param vocab_path: path of the pickle file holding the vocabulary
    :return: the unpickled object (expected to be the word2id mapping)
    """
    # SECURITY: pickle.load can execute arbitrary code while unpickling;
    # only point this at vocabulary files from a trusted source.
    with open(vocab_path, 'rb') as fr:
        word2id = pickle.load(fr)
    print('vocab_size:', len(word2id))
    return word2id


def random_embedding(vocab, embedding_dim):
    """
    Create a randomly initialised embedding matrix for a vocabulary.

    Values are drawn uniformly from [-0.25, 0.25) and cast to float32.

    :param vocab: sized collection; its length sets the number of rows
    :param embedding_dim: number of columns (embedding width)
    :return: float32 array of shape ``(len(vocab), embedding_dim)``
    """
    shape = (len(vocab), embedding_dim)
    embedding_mat = np.random.uniform(-0.25, 0.25, shape)
    return embedding_mat.astype(np.float32)


def pad_sequences(sequences pad_mark=0):
    “““

    :param sequences:
    :param pad_mark:
    :return:
    “““
    max_len = max(map(lambda x : len(x) sequences))
    seq_list seq_len_list = [] []
    for seq in seque

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2019-03-16 13:58  zh-NER-TF-master\
     文件          28  2019-03-16 13:58  zh-NER-TF-master\.gitignore
     文件        4103  2019-03-16 13:58  zh-NER-TF-master\README.md
     文件       12732  2019-03-16 13:58  zh-NER-TF-master\conlleval_rev.pl
     文件        3824  2019-03-16 13:58  zh-NER-TF-master\data.py
     目录           0  2019-03-16 13:58  zh-NER-TF-master\data_path\
     目录           0  2019-03-16 13:58  zh-NER-TF-master\data_path\original\
     文件          49  2019-03-16 13:58  zh-NER-TF-master\data_path\original\link.txt
     文件      526458  2019-03-16 13:58  zh-NER-TF-master\data_path\original\test1.txt
     文件      577540  2019-03-16 13:58  zh-NER-TF-master\data_path\original\testright1.txt
     文件    10480443  2019-03-16 13:58  zh-NER-TF-master\data_path\original\train1.txt
     文件     1114268  2019-03-16 13:58  zh-NER-TF-master\data_path\test_data
     文件    13904440  2019-03-16 13:58  zh-NER-TF-master\data_path\train_data
     文件       61479  2019-03-16 13:58  zh-NER-TF-master\data_path\word2id.pkl
     目录           0  2019-03-16 13:58  zh-NER-TF-master\data_path_save\
     目录           0  2019-03-16 13:58  zh-NER-TF-master\data_path_save\1521112368\
     目录           0  2019-03-16 13:58  zh-NER-TF-master\data_path_save\1521112368\checkpoints\
     文件          79  2019-03-16 13:58  zh-NER-TF-master\data_path_save\1521112368\checkpoints\checkpoint
     文件    31417884  2019-03-16 13:58  zh-NER-TF-master\data_path_save\1521112368\checkpoints\model-31680.data-00000-of-00001
     文件        1215  2019-03-16 13:58  zh-NER-TF-master\data_path_save\1521112368\checkpoints\model-31680.index
     文件     5306570  2019-03-16 13:58  zh-NER-TF-master\data_path_save\1521112368\checkpoints\model-31680.meta
     文件         778  2019-03-16 13:58  zh-NER-TF-master\eval.py
     文件        5605  2019-03-16 13:58  zh-NER-TF-master\main.py
     文件       12572  2019-03-16 13:58  zh-NER-TF-master\model.py
     目录           0  2019-03-16 13:58  zh-NER-TF-master\pics\
     文件         961  2019-03-16 13:58  zh-NER-TF-master\pics\demo.txt
     文件      786270  2019-03-16 13:58  zh-NER-TF-master\pics\pic1.png
     文件      291153  2019-03-16 13:58  zh-NER-TF-master\pics\pic2.png
     文件        2814  2019-03-16 13:58  zh-NER-TF-master\utils.py

评论

共有 条评论