Resource overview
A Keras implementation of Chinese text classification: the text is segmented with jieba, word vectors are introduced via an embedding layer, and 1-D convolutions over the resulting semantic features drive the classification.
Code snippet and file information
# -*- coding: utf-8 -*-
import numpy as np
import jieba
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras
def load_data(file_path, stop_words_path):  # e.g. 'E:/data/NLP/textteaser_new/textteaser/trainer/stopWords.txt'
    # get stop words
    with open(stop_words_path, encoding='utf-8') as file:
        words = file.readlines()
    stop_words = [word.replace('\n', '') for word in words]
    # get content and label
    data = [item.split('\t') for item in open(file_path, 'r', encoding='utf-8').readlines()]
    content = [item[3] for item in data]
    label = [item[1] for item in data]
    # encode the string labels as one-hot vectors
    le = preprocessing.LabelEncoder()
    ohe = preprocessing.OneHotEncoder()
    label_le = [[item] for item in le.fit_transform(label)]
    label_ohe = ohe.fit_transform(label_le).toarray()
    # segment with jieba and exclude stop words
    content = [' '.join([word for word in jieba.cut(item) if word not in stop_words]) for item in content]
    return [content, label_ohe]
file_path = ‘E:/data/NLP/textteaser/test_data‘
stop_words_path = ‘E:/data/NLP/textteaser_new/textteaser/trainer/stopWords.txt‘
contents, labels = load_data(file_path=file_path, stop_words_path=stop_words_path)
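# Judging from the indexing above (item[1] for the label, item[3] for the
# text), each line of file_path is assumed to be tab-separated, with the
# class label in the second field and the raw text in the fourth. A
# hypothetical row, for illustration only:
#   doc_id<TAB>label<TAB>source<TAB>text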
# vocabulary configuration parameters
MAX_NB_WORDS = 10000
MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 128
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts=contents)
sequences = tokenizer.texts_to_sequences(texts=contents)
word_index = tokenizer.word_index
data = pad_sequences(sequences=sequences, maxlen=MAX_SEQUENCE_LENGTH)
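# texts_to_sequences maps each segmented text to a list of word indices
# (only the MAX_NB_WORDS most frequent words are kept), and pad_sequences
# pads/truncates every sequence to a fixed length of MAX_SEQUENCE_LENGTH.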
np.random.seed(101)
shuffled_index = np.random.permutation(np.arange(len(data)))
shuffled_data = data[shuffled_index]
shuffled_label = labels[shuffled_index]
print(shuffled_data.shape)
print(shuffled_label.shape)
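# A simple hold-out split on the shuffled arrays; the original snippet stops
# before this step, and the 80/20 ratio here is an illustrative assumption.
split = int(len(shuffled_data) * 0.8)
x_train, x_val = shuffled_data[:split], shuffled_data[split:]
y_train, y_val = shuffled_label[:split], shuffled_label[split:]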
# from keras.layers.core import Dense, Dropout, Flatten
# from keras.models import Model
# from keras.layers import Input, MaxPooling1D, Embedding, Conv1D
# # build the training network
# inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
# embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH, trainable=True)
# embedded_sequences = embedding_layer(inputs)
#
# # convolution branch 1: kernel_size = 3, filters = 128, 256, 512
# x_1 = Conv1D(128, 3, activation='relu')(embedded_sequences)
# x_1 = MaxPooling1D(3, strides=2)(x_1)
# x_1 = Conv1D(256, 3, activation='relu')(x_1)
# x_1 = MaxPooling1D(3, strides=2)(x_1)
# x_1 = Conv1D(512, 3, activation='relu')(x_1)
# x_1 = MaxPooling1D(3, strides=2)(x_1)
# x_1 = Flatten()(x_1)
# out_put = Dense(len(labels[0]), activation='softmax')(x_1)
#
# # # convolution branch 2: kernel_size = 4, filters = 128, 256, 512
# # x_2 = Conv1D(128, 4, activation='relu')(embedded_sequences)
# # x_2 = MaxPooling1D(4, strides=2)(x_2)
# # x_2 = Conv1D(256, 4, activation='relu')(x_2)
# # x_2 = MaxPooling1D(4, strides=2)(x_2)
# # x_2 = Conv1D(512, 4, activation='relu')(x_2)
# # x_2 = MaxPooling1D(4, strides=2)(x_2)
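For reference, here is a minimal runnable sketch that completes the commented branch-1 network above into a trainable model. The loop over filter counts reproduces the commented Conv1D/MaxPooling1D stack; the optimizer, loss, batch size, and epoch count are illustrative assumptions, not part of the original snippet.

from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dense

inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = Embedding(len(word_index) + 1, EMBEDDING_DIM,
              input_length=MAX_SEQUENCE_LENGTH, trainable=True)(inputs)
# three Conv1D/MaxPooling1D stages, mirroring the commented branch-1 stack
for filters in (128, 256, 512):
    x = Conv1D(filters, 3, activation='relu')(x)
    x = MaxPooling1D(3, strides=2)(x)
x = Flatten()(x)
outputs = Dense(y_train.shape[1], activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)
# 'adam' and categorical cross-entropy are assumed defaults for one-hot labels
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=64, epochs=10)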