Resource overview
A Keras implementation of Chinese text classification: the text is segmented with jieba, word vectors are introduced via an embedding layer, and 1-D convolutions over the resulting semantic features drive the classification.
Code snippet and file information
# -*- coding: utf-8 -*-
import numpy as np
import jieba
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras
def load_data(file_path, stop_words_path):  # e.g. 'E:/data/NLP/textteaser_new/textteaser/trainer/stopWords.txt'
    # get stop words
    with open(stop_words_path, encoding='utf-8') as file:
        words = file.readlines()
    stop_words = [word.replace('\n', '') for word in words]
    # get content and label
    data = [item.split('\t') for item in open(file_path, 'r', encoding='utf-8').readlines()]
    content = [item[3] for item in data]
    label = [item[1] for item in data]
    # encode the string labels as one-hot vectors
    le = preprocessing.LabelEncoder()
    ohe = preprocessing.OneHotEncoder()
    label_le = [[item] for item in le.fit_transform(label)]
    label_ohe = ohe.fit_transform(label_le).toarray()
    # segment with jieba and exclude stop words
    content = [' '.join([word for word in jieba.cut(item) if word not in stop_words]) for item in content]
    return [content, label_ohe]
file_path = ‘E:/data/NLP/textteaser/test_data‘
stop_words_path = ‘E:/data/NLP/textteaser_new/textteaser/trainer/stopWords.txt‘
contents, labels = load_data(file_path=file_path, stop_words_path=stop_words_path)
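# Judging from the indexing above (item[1] for the label, item[3] for the
# text), each line of file_path is assumed to be tab-separated, with the
# class label in the second field and the raw text in the fourth. A
# hypothetical row, for illustration only:
#   doc_id<TAB>label<TAB>source<TAB>text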
# vocabulary configuration parameters
MAX_NB_WORDS = 10000
MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 128
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts=contents)
sequences = tokenizer.texts_to_sequences(texts=contents)
word_index = tokenizer.word_index
data = pad_sequences(sequences=sequences, maxlen=MAX_SEQUENCE_LENGTH)
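# texts_to_sequences maps each segmented text to a list of word indices
# (only the MAX_NB_WORDS most frequent words are kept), and pad_sequences
# pads/truncates every sequence to a fixed length of MAX_SEQUENCE_LENGTH.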
np.random.seed(101)
shuffled_index = np.random.permutation(np.arange(len(data)))
shuffled_data = data[shuffled_index]
shuffled_label = labels[shuffled_index]
print(shuffled_data.shape)
print(shuffled_label.shape)
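# A simple hold-out split on the shuffled arrays; the original snippet stops
# before this step, and the 80/20 ratio here is an illustrative assumption.
split = int(len(shuffled_data) * 0.8)
x_train, x_val = shuffled_data[:split], shuffled_data[split:]
y_train, y_val = shuffled_label[:split], shuffled_label[split:]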
# from keras.layers.core import Dense, Dropout, Flatten
# from keras.models import Model
# from keras.layers import Input, MaxPooling1D, Embedding, Conv1D
# # build the training network
# inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
# embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH, trainable=True)
# embedded_sequences = embedding_layer(inputs)
#
# # convolution branch 1: kernel_size = 3, filters = 128, 256, 512
# x_1 = Conv1D(128, 3, activation='relu')(embedded_sequences)
# x_1 = MaxPooling1D(3, strides=2)(x_1)
# x_1 = Conv1D(256, 3, activation='relu')(x_1)
# x_1 = MaxPooling1D(3, strides=2)(x_1)
# x_1 = Conv1D(512, 3, activation='relu')(x_1)
# x_1 = MaxPooling1D(3, strides=2)(x_1)
# x_1 = Flatten()(x_1)
# out_put = Dense(len(labels[0]), activation='softmax')(x_1)
#
# # # convolution branch 2: kernel_size = 4, filters = 128, 256, 512
# # x_2 = Conv1D(128, 4, activation='relu')(embedded_sequences)
# # x_2 = MaxPooling1D(4, strides=2)(x_2)
# # x_2 = Conv1D(256, 4, activation='relu')(x_2)
# # x_2 = MaxPooling1D(4, strides=2)(x_2)
# # x_2 = Conv1D(512, 4, activation='relu')(x_2)
# # x_2 = MaxPooling1D(4, strides=2)(x_2)
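For reference, here is a minimal runnable sketch that completes the commented branch-1 network above into a trainable model. The loop over filter counts reproduces the commented Conv1D/MaxPooling1D stack; the optimizer, loss, batch size, and epoch count are illustrative assumptions, not part of the original snippet.

from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dense

inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = Embedding(len(word_index) + 1, EMBEDDING_DIM,
              input_length=MAX_SEQUENCE_LENGTH, trainable=True)(inputs)
# three Conv1D/MaxPooling1D stages, mirroring the commented branch-1 stack
for filters in (128, 256, 512):
    x = Conv1D(filters, 3, activation='relu')(x)
    x = MaxPooling1D(3, strides=2)(x)
x = Flatten()(x)
outputs = Dense(y_train.shape[1], activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)
# 'adam' and categorical cross-entropy are assumed defaults for one-hot labels
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=64, epochs=10)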