资源简介
基于卷积神经网络处理中文文本分类
代码片段和文件信息
import numpy as np
import re
from sklearn.preprocessing import LabelBinarizer
from tensorflow.contrib import learn
import pdb
import collections
def clean_str(string):
    """Tokenize and normalize an English sentence.

    Tokenization/string cleaning for all datasets except for SST.
    Original taken from
    https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Args:
        string: The raw sentence.

    Returns:
        The lower-cased sentence with tokens separated by single spaces and
        no leading or trailing whitespace.
    """
    # Every character outside this whitelist (including all non-ASCII
    # characters) is replaced by a space.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Split common English contractions off the word they are attached to.
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)

    # Surround punctuation with spaces so each mark becomes its own token.
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # NOTE: re.sub leaves unknown non-letter escapes in the replacement
    # untouched, so these three emit a literal backslash before the mark
    # (e.g. the token "\("). Raw strings keep that output while avoiding
    # Python's invalid-escape-sequence warning.
    string = re.sub(r"\(", r" \( ", string)
    string = re.sub(r"\)", r" \) ", string)
    string = re.sub(r"\?", r" \? ", string)

    # Collapse every run of two or more whitespace characters into one space.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
def load_data_and_labels(positive_data_file, negative_data_file):
    """Load two-class sentence data and build one-hot labels.

    Each file is read line by line; every line is stripped and then
    normalized with ``clean_str``. Positive sentences come first in the
    output, followed by the negative ones.

    Args:
        positive_data_file: Path to the file of positive examples, one per
            line.
        negative_data_file: Path to the file of negative examples, one per
            line.

    Returns:
        A two-element list ``[x_text, y]`` where ``x_text`` is the list of
        cleaned sentences and ``y`` is an array with one row per sentence:
        ``[0, 1]`` for a positive example and ``[1, 0]`` for a negative one.
    """
    # Load data from files; the context managers guarantee both handles
    # are closed even if reading fails.
    with open(positive_data_file, "r") as f:
        positive_examples = [line.strip() for line in f]
    with open(negative_data_file, "r") as f:
        negative_examples = [line.strip() for line in f]

    # Clean every sentence, positives first.
    x_text = [clean_str(sent) for sent in positive_examples + negative_examples]

    # Generate labels in the same order as x_text.
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
def load_data_labels(data_file, labels_file, max_document_length=1000):
    """Load documents and their labels and convert both to model inputs.

    Documents are read from ``data_file`` (decoded as latin-1), stripped and
    normalized with ``clean_str``, then mapped to fixed-length sequences of
    word ids. Labels are read from ``labels_file``; the second
    comma-separated field of each line is used as the class name and the
    class names are binarized with ``LabelBinarizer``.

    Args:
        data_file: Path to the text file with one document per line.
        labels_file: Path to the label file with one comma-separated record
            per line; the class name is taken from the second field.
        max_document_length: Document length passed to
            ``VocabularyProcessor``. Defaults to 1000.

    Returns:
        A tuple ``(x, y, vocab_processor)``: the array of word-id sequences,
        the binarized labels, and the fitted vocabulary processor.

    Raises:
        IndexError: If a line in ``labels_file`` has no second
            comma-separated field.
    """
    with open(data_file, 'r', encoding='latin-1') as f:
        data = [clean_str(line.strip()) for line in f]

    with open(labels_file, 'r') as f:
        raw_labels = [line.strip() for line in f]
    # Keep only the class name (second comma-separated field) of each record.
    labels = [record.split(',')[1].strip() for record in raw_labels]

    lb = LabelBinarizer()
    y = lb.fit_transform(labels)

    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(data)))
    return x, y, vocab_processor
def batch_iter(data batch_size num_epochs shuffle=True):
“““
Generates a batch iterator for a dataset.
“““
data = np.array(data)
data_size = len(data)
num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
for epoch in range(num_epochs):
# Shuffle the data at each epoch
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2017-05-22 00:05 TextCNN-master\
文件 1052 2017-05-22 00:05 TextCNN-master\.gitignore
目录 0 2017-05-22 00:05 TextCNN-master\.idea\
文件 548 2017-05-22 00:05 TextCNN-master\.idea\TextCNN.iml
目录 0 2017-05-22 00:05 TextCNN-master\.idea\inspectionProfiles\
文件 228 2017-05-22 00:05 TextCNN-master\.idea\inspectionProfiles\profiles_settings.xml
文件 241 2017-05-22 00:05 TextCNN-master\.idea\misc.xml
文件 266 2017-05-22 00:05 TextCNN-master\.idea\modules.xml
文件 180 2017-05-22 00:05 TextCNN-master\.idea\vcs.xml
文件 37209 2017-05-22 00:05 TextCNN-master\.idea\workspace.xml
文件 7651 2017-05-22 00:05 TextCNN-master\LICENSE
文件 60 2017-05-22 00:05 TextCNN-master\README.md
文件 3593 2017-05-22 00:05 TextCNN-master\data_helpers.py
文件 4031 2017-05-22 00:05 TextCNN-master\eval.py
文件 3775 2017-05-22 00:05 TextCNN-master\text_cnn.py
文件 8210 2017-05-22 00:05 TextCNN-master\train.py
文件 8551 2017-05-22 00:05 TextCNN-master\train_debug.py
- 上一篇:ArduinoUSBKeyboard库文件
- 下一篇:大学物理实验思考题答案.txt
相关资源
- Faster-Rcnn-TF预训练模型
- 基于tensorflow的猫狗图片的识别分类
- keras做CNN的训练误差loss的下降操作
- 基于TensorFlow的CNN实现Mnist手写数字识
- 在Keras中CNN联合LSTM进行分类
- 批量处理labelme生成的json工具
- 分布式CNN代码
- windows+TensorFlow+mask R-CNN
- caffe下faster-rcnn的ResNet-50配置文件
- Faster R-CNN源代码
- VGGnet_fast_rcnn_iter_70000.ckpt
- 基于CNN的车牌识别字符模板
- Faster_RCNN绘制P-R曲线、检测视频
- 中文文本分类停用词表
- crowdcount-mcnn-master复现的预训练模型
- CNNIC数字证书工具1.2和rootsupd.exe.zip
- FasterRCNNTensorFlow源码理解.rar
- Cascade RCNN图解
- CNN唯一开源FPGA实现
- 论文研究 - 使用更快的RCNN Inception V
- An Energy-Efficient Reconfigurable Accelerator
- pynq-z2_boardfiles.zip
- Gated-SCNN_model.txt
- lstm-char-cnn 基于CNN的LSTM语言模型.zip
- MTCNN人脸检测模型
- pytorch实现人脸识别包括人脸检测(
- TensorFlow基于CIFAR10数据集的卷积神经网
- CNN训练人脸识别
- maskrcnn 用于细胞图像分割代码
- MTCNN人脸侦测项目代码-pytorch
评论
共有 条评论