• Size: 11.63MB
    File type: .zip
    Coins: 2
    Downloads: 0
    Release date: 2024-02-03
  • Language: Other
  • Tags: Chinese word segmentation

Resource Description

Chinese word segmentation based on a bidirectional LSTM (Keras/TensorFlow), trained on the People's Daily corpus; reported segmentation accuracy reaches 97%.
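
The archive's train.py is not shown in this snippet. Purely as an illustration, a minimal Keras sketch of a bidirectional-LSTM character tagger of this kind could look like the following; vocab_size, maxlen, and all layer sizes here are hypothetical assumptions, not values taken from the archive:

# Minimal sketch of a BiLSTM s/b/m/e tagger (vocab_size, maxlen, and
# layer sizes are illustrative assumptions, not from train.py).
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense

vocab_size = 6000   # hypothetical character-vocabulary size
maxlen = 32         # hypothetical padded sentence length

inp = Input(shape=(maxlen,), dtype='int32')
x = Embedding(vocab_size, 128, mask_zero=True)(inp)        # character embeddings
x = Bidirectional(LSTM(64, return_sequences=True))(x)      # left and right context
out = TimeDistributed(Dense(4, activation='softmax'))(x)   # per-character distribution over s/b/m/e
model = Model(inp, out)
model.compile(optimizer='adam', loss='categorical_crossentropy')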

Code Snippet and File Information

# Uncomment the three lines below to force CPU-only execution:
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = ""
from keras.models import model_from_json
from keras import backend as K
from DataProcessing.embedding import Data
import numpy as np
import json
import os

# Transition scores between consecutive tags (b=begin, m=middle, e=end,
# s=single-character word); only these tag bigrams are valid transitions.
transpose_matrix = {'be': 0.5,
                    'bm': 0.5,
                    'eb': 0.5,
                    'es': 0.5,
                    'me': 0.5,
                    'mm': 0.5,
                    'sb': 0.5,
                    'ss': 0.3}

# Transition scores in log space; viterbi() adds these to the network's
# per-character log-probabilities so all scoring stays in the log domain.
zy = {i: np.log(transpose_matrix[i]) for i in transpose_matrix.keys()}


def viterbi(nodes):
    # nodes: one dict per character, mapping tag -> log-probability.
    # Standard Viterbi decoding over the tag bigrams allowed by transpose_matrix:
    # keep, for each tag, the best-scoring path that ends in that tag.
    path = {'b': nodes[0]['b'], 's': nodes[0]['s']}
    for layer_num in range(1, len(nodes)):
        old_path = path.copy()
        path = {}
        for new_tag in nodes[layer_num].keys():
            tmp = {}
            if layer_num == len(nodes) - 1:
                # The last character cannot begin ('b') or continue ('m') a word.
                if new_tag in ['m', 'b']:
                    continue
            for old_path_tag in old_path.keys():
                if old_path_tag[-1] + new_tag in transpose_matrix.keys():
                    # Add the log-domain transition score (zy), keeping all terms in log space.
                    tmp[old_path_tag + new_tag] = old_path[old_path_tag] + nodes[layer_num][new_tag] + zy[old_path_tag[-1] + new_tag]
            k = np.argmax(list(tmp.values()))
            path[list(tmp.keys())[k]] = list(tmp.values())[k]
        # print(path)
    return list(path.keys())[np.argmax(list(path.values()))]
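
# Hypothetical example: with two characters whose log-probabilities strongly
# favour 'b' then 'e', viterbi returns the tag string 'be':
#   viterbi([{'s': -3.0, 'b': -0.1, 'm': -3.0, 'e': -3.0},
#            {'s': -3.0, 'b': -3.0, 'm': -3.0, 'e': -0.1}])  # -> 'be'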

def cut(asentence, tagstr):
    # Insert a space after every character tagged 'e' (word end) or 's' (single-character word).
    result = ''
    for (character, tag) in zip(asentence, tagstr):
        result += character
        if str(tag) in ['s', 'e']:
            result += ' '
    return result
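
# Example: cut('今天天气好', 'bebes') -> '今天 天气 好 ' -- a space follows
# each word-final ('e') or single-character ('s') tag.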


if __name__ == '__main__':
    filedir = "D:\\codes\\python\\keras_splitwords\\datas"
    model_name = "model.json"
    weights_name = "model_weights.h5"
    test_sentence = ""  # the sentence to segment goes here

    data = Data(filedir)
    charsets = data.load_charsets_from_file("charsets.pkl")
    # The architecture (JSON) and the weights (HDF5) were saved separately.
    model = model_from_json(json.load(open(os.path.join(filedir, model_name), "r")))
    model.load_weights(os.path.join(filedir, weights_name))

    sentence_embeddings, all_sentences, sentence_len = data.get_sent_embeddings(charsets, test_sentence)
    # print("sentence len:", sentence_len)
    result = model.predict(sentence_embeddings, verbose=False)

    print(test_sentence)
    for (aresult, asentence, length) in zip(result, all_sentences, sentence_len):
        aresult = aresult[:length]  # drop padding positions
        one_sentence_nodes = [{k: v for (k, v) in zip(['s', 'b', 'm', 'e'], np.log(i[:4]))} for i in aresult]
        print(cut(asentence, viterbi(one_sentence_nodes)))

    K.clear_session()
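
As published, the script ships with an empty test_sentence and a hard-coded Windows filedir; to try it, unpack the archive, point filedir at the datas directory, and set test_sentence to the text you want segmented.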

 Attribute        Size     Date       Time   Name
----------- ---------  ---------- -----  ----
 directory           0  2018-05-16 04:53  Chinese-Word-Split-master\
 file               12  2018-05-16 04:53  Chinese-Word-Split-master\.gitignore
 directory           0  2018-05-16 04:53  Chinese-Word-Split-master\DataProcessing\
 file                0  2018-05-16 04:53  Chinese-Word-Split-master\DataProcessing\__init__.py
 file             3143  2018-05-16 04:53  Chinese-Word-Split-master\DataProcessing\embedding.py
 directory           0  2018-05-16 04:53  Chinese-Word-Split-master\datas\
 file           531361  2018-05-16 04:53  Chinese-Word-Split-master\datas\charsets.pkl
 file             2880  2018-05-16 04:53  Chinese-Word-Split-master\datas\model.json
 file             2879  2018-05-16 04:53  Chinese-Word-Split-master\datas\model_usebias.json
 file          3061352  2018-05-16 04:53  Chinese-Word-Split-master\datas\model_weights.h5
 file          3061352  2018-05-16 04:53  Chinese-Word-Split-master\datas\model_weights_usebias.h5
 file         24389693  2018-05-16 04:53  Chinese-Word-Split-master\datas\msr_train.txt
 file             2546  2018-05-16 04:53  Chinese-Word-Split-master\gen.py
 file             1391  2018-05-16 04:53  Chinese-Word-Split-master\train.py
