Resource overview
A Python implementation of CNN-based Chinese text classification (spam/ham data, trained with TensorFlow and word2vec embeddings).
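For orientation, the classifier behind this resource follows the standard TextCNN pattern: each sentence is embedded (here via word2vec_helpers), run through parallel convolutions of several filter widths, max-pooled over time, and fed to a softmax layer. The repository's own model file is not reproduced in the snippet below, so the following is only an illustrative sketch, assuming TensorFlow 1.x and pre-embedded word2vec input; the class name, filter sizes and filter counts are placeholders, not the repository's actual code.

# Illustrative TextCNN sketch (assumes TensorFlow 1.x; not the repository's own model code)
import tensorflow as tf

class TextCNN(object):
    """CNN for sentence classification: embedded input -> conv + max-pool -> softmax."""
    def __init__(self, sequence_length, num_classes, embedding_size,
                 filter_sizes=(3, 4, 5), num_filters=128):
        # Pre-embedded input (e.g. word2vec vectors), shape: [batch, seq_len, embed_dim]
        self.input_x = tf.placeholder(tf.float32,
                                      [None, sequence_length, embedding_size], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")

        # Add a channel dimension for conv2d: [batch, seq_len, embed_dim, 1]
        expanded = tf.expand_dims(self.input_x, -1)

        pooled = []
        for filter_size in filter_sizes:
            with tf.name_scope("conv-maxpool-%d" % filter_size):
                W = tf.Variable(tf.truncated_normal(
                    [filter_size, embedding_size, 1, num_filters], stddev=0.1))
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]))
                conv = tf.nn.conv2d(expanded, W, strides=[1, 1, 1, 1], padding="VALID")
                h = tf.nn.relu(tf.nn.bias_add(conv, b))
                # Max-pool over the whole sentence for each filter
                pooled.append(tf.nn.max_pool(
                    h, ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1], padding="VALID"))

        total_filters = num_filters * len(filter_sizes)
        h_pool = tf.reshape(tf.concat(pooled, 3), [-1, total_filters])

        # Final fully-connected softmax layer
        W_out = tf.Variable(tf.truncated_normal([total_filters, num_classes], stddev=0.1))
        b_out = tf.Variable(tf.constant(0.1, shape=[num_classes]))
        self.scores = tf.nn.xw_plus_b(h_pool, W_out, b_out, name="scores")
        self.predictions = tf.argmax(self.scores, 1, name="predictions")
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y))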
Code snippet and file information
# encoding: UTF-8
import numpy as np
import re
import itertools
from collections import Counter
import os
import word2vec_helpers
import time
import pickle

def load_data_and_labels(input_text_file, input_label_file, num_labels):
    x_text = read_and_clean_zh_file(input_text_file)
    # One integer label per line; None if no label file exists
    y = None if not os.path.exists(input_label_file) else list(map(int, open(input_label_file, "r").readlines()))
    return (x_text, y)
def load_positive_negative_data_files(positive_data_file, negative_data_file):
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = read_and_clean_zh_file(positive_data_file)
    negative_examples = read_and_clean_zh_file(negative_data_file)
    # Combine data
    x_text = positive_examples + negative_examples
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
def padding_sentences(input_sentences, padding_token, padding_sentence_length=None):
    # Pad (or truncate) every tokenized sentence to a common length
    sentences = [sentence.split(' ') for sentence in input_sentences]
    max_sentence_length = padding_sentence_length if padding_sentence_length is not None else max(
        [len(sentence) for sentence in sentences])
    for sentence in sentences:
        if len(sentence) > max_sentence_length:
            # Truncate in place so the change is reflected in the returned list
            del sentence[max_sentence_length:]
        else:
            sentence.extend([padding_token] * (max_sentence_length - len(sentence)))
    return (sentences, max_sentence_length)
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    '''
    Generate a batch iterator for a dataset
    '''
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        if shuffle:
            # Shuffle the data at each epoch
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_idx = batch_num * batch_size
            end_idx = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_idx : end_idx]
def test():
    # Test clean_str
    print("Test")
    #print(clean_str("This's a huge dog! Who're going to the top."))
    # Test load_positive_negative_data_files
    #x_text, y = load_positive_negative_data_files("./tiny_data/rt-polarity.pos", "./tiny_data/rt-polarity.neg")
    #print(x_text)
    #print(y)
    # Test batch_iter
    #batches = batch_iter(x_text, 2, 4)
    #for batch in batches:
    #    print(batch)
def mkdir_if_not_exist(dirpath):
    if not os.path.exists(dirpath):
        os.mkdir(dirpath)

def seperate_line(line):
    # Insert a space after every character so Chinese text can later be split character by character
    return ''.join([word + ' ' for word in line])

def read_and_clean_zh_file(input_file, output_cleaned_file =
Attribute         Size  Date       Time   Name
-----------  ---------  ---------- -----  ----
Directory            0  2017-06-15 02:39  zh_cnn_text_classify-master\
File                14  2017-06-15 02:39  zh_cnn_text_classify-master\.gitignore
File              1917  2017-06-15 02:39  zh_cnn_text_classify-master\README.md
Directory            0  2017-06-15 02:39  zh_cnn_text_classify-master\data\
File             59489  2017-06-15 02:39  zh_cnn_text_classify-master\data\ham_100.utf8
File             44997  2017-06-15 02:39  zh_cnn_text_classify-master\data\spam_100.utf8
File              4504  2017-06-15 02:39  zh_cnn_text_classify-master\data_helpers.py
File              4870  2017-06-15 02:39  zh_cnn_text_classify-master\eval.py
Directory            0  2017-06-15 02:39  zh_cnn_text_classify-master\runs\
Directory            0  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\
Directory            0  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\
File               697  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\checkpoint
File           2373156  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-200.data-00000-of-00001
File              1009  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-200.index
File            102143  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-200.me
File           2373156  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-300.data-00000-of-00001
File              1009  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-300.index
File            102143  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-300.me
File           2373156  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-400.data-00000-of-00001
File              1009  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-400.index
File            102143  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-400.me
File           2373156  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-500.data-00000-of-00001
File              1009  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-500.index
File            102143  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-500.me
File           2373156  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-600.data-00000-of-00001
File              1009  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-600.index
File            102143  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-600.me
File             46336  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\prediction.csv
Directory            0  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\summaries\
Directory            0  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\summaries\dev\
File            159244  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\summaries\dev\events.out.tfevents.1492954586.escenter11PC
............ (7 more file entries omitted)
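For reference, a minimal example of how the padding and batching helpers above fit together, assuming the snippet is saved as data_helpers.py (as in the file listing); the sentences and the <PADDING> token here are toy placeholders, not data from the archive:

import data_helpers  # the snippet above, saved as data_helpers.py

# Whitespace-separated tokens, as produced by seperate_line()/read_and_clean_zh_file()
sentences = ["我 喜欢 这 部 电影", "太 差 了"]

# Pad every sentence to the length of the longest one
padded, max_len = data_helpers.padding_sentences(sentences, "<PADDING>")
print(max_len)    # 5
print(padded[1])  # ['太', '差', '了', '<PADDING>', '<PADDING>']

# Iterate over shuffled mini-batches for two epochs
for batch in data_helpers.batch_iter(padded, batch_size=2, num_epochs=2):
    print(batch.shape)  # (2, 5) for each batch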
Related resources
- Deep Learning with Python, Francois Chollet
- Gesture recognition based on convolutional neural networks
- CNN for digit sequences beyond image classification (.rar)
- DnCNN TensorFlow implementation
- Python-Tensorflow implementation of SpatialAsDeepSpatialCNN
- CNN+pythoncode8.18.zip
- Pulmonary nodule recognition using CNN
- NLP in practice: fastText text classification on THUCNews
- RNN python
- Maximum-entropy text classification
- License plate recognition, Tensorflow_CNN_python_opencv.zip
- NetEase news data for Chinese text classification
- Text classification with the Naive Bayes algorithm_Pyt
- cifar-10 90%+ code
- Naive Bayes text classification in Python (incl.
- Python implementation of deep learning testCNN
- Converting labelme-annotated datasets to COCO-format datasets
- Python project case development from beginner to practice, source code
- Finger vein recognition, keras, CNN
- Building a CNN convolutional neural network with TensorFlow in Python 3
- CNN convolutional neural network Python code
- Attention-CNN (by Jianlong-Fu)
- Handwritten digit recognition based on convolutional neural networks
- CIFAR-10 image classification with a Tensorflow CNN, p
- cython_bbox.so
- CNN code suitable for beginners
- Text classification based on Naive Bayes
- LDA text classification algorithm
- Source code for CNN recognition of MNIST, with detailed comments
- Naive Bayes text classifier implemented in Python