Resource overview
A Python implementation of CNN-based Chinese text classification (spam/ham data, trained with TensorFlow and word2vec embeddings).
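For orientation, the classifier behind this resource follows the standard TextCNN pattern: each sentence is embedded (here via word2vec_helpers), run through parallel convolutions of several filter widths, max-pooled over time, and fed to a softmax layer. The repository's own model file is not reproduced in the snippet below, so the following is only an illustrative sketch, assuming TensorFlow 1.x and pre-embedded word2vec input; the class name, filter sizes and filter counts are placeholders, not the repository's actual code.

# Illustrative TextCNN sketch (assumes TensorFlow 1.x; not the repository's own model code)
import tensorflow as tf

class TextCNN(object):
    """CNN for sentence classification: embedded input -> conv + max-pool -> softmax."""
    def __init__(self, sequence_length, num_classes, embedding_size,
                 filter_sizes=(3, 4, 5), num_filters=128):
        # Pre-embedded input (e.g. word2vec vectors), shape: [batch, seq_len, embed_dim]
        self.input_x = tf.placeholder(tf.float32,
                                      [None, sequence_length, embedding_size], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")

        # Add a channel dimension for conv2d: [batch, seq_len, embed_dim, 1]
        expanded = tf.expand_dims(self.input_x, -1)

        pooled = []
        for filter_size in filter_sizes:
            with tf.name_scope("conv-maxpool-%d" % filter_size):
                W = tf.Variable(tf.truncated_normal(
                    [filter_size, embedding_size, 1, num_filters], stddev=0.1))
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]))
                conv = tf.nn.conv2d(expanded, W, strides=[1, 1, 1, 1], padding="VALID")
                h = tf.nn.relu(tf.nn.bias_add(conv, b))
                # Max-pool over the whole sentence for each filter
                pooled.append(tf.nn.max_pool(
                    h, ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1], padding="VALID"))

        total_filters = num_filters * len(filter_sizes)
        h_pool = tf.reshape(tf.concat(pooled, 3), [-1, total_filters])

        # Final fully-connected softmax layer
        W_out = tf.Variable(tf.truncated_normal([total_filters, num_classes], stddev=0.1))
        b_out = tf.Variable(tf.constant(0.1, shape=[num_classes]))
        self.scores = tf.nn.xw_plus_b(h_pool, W_out, b_out, name="scores")
        self.predictions = tf.argmax(self.scores, 1, name="predictions")
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y))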
Code snippet and file information
# encoding: UTF-8
import numpy as np
import re
import itertools
from collections import Counter
import os
import word2vec_helpers
import time
import pickle

def load_data_and_labels(input_text_file, input_label_file, num_labels):
    x_text = read_and_clean_zh_file(input_text_file)
    # One integer label per line; None if no label file exists
    y = None if not os.path.exists(input_label_file) else list(map(int, open(input_label_file, "r").readlines()))
    return (x_text, y)
def load_positive_negative_data_files(positive_data_file, negative_data_file):
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = read_and_clean_zh_file(positive_data_file)
    negative_examples = read_and_clean_zh_file(negative_data_file)
    # Combine data
    x_text = positive_examples + negative_examples
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
def padding_sentences(input_sentences, padding_token, padding_sentence_length=None):
    # Pad (or truncate) every tokenized sentence to a common length
    sentences = [sentence.split(' ') for sentence in input_sentences]
    max_sentence_length = padding_sentence_length if padding_sentence_length is not None else max(
        [len(sentence) for sentence in sentences])
    for sentence in sentences:
        if len(sentence) > max_sentence_length:
            # Truncate in place so the change is reflected in the returned list
            del sentence[max_sentence_length:]
        else:
            sentence.extend([padding_token] * (max_sentence_length - len(sentence)))
    return (sentences, max_sentence_length)
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    '''
    Generate a batch iterator for a dataset
    '''
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        if shuffle:
            # Shuffle the data at each epoch
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_idx = batch_num * batch_size
            end_idx = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_idx : end_idx]
def test():
    # Test clean_str
    print("Test")
    #print(clean_str("This's a huge dog! Who're going to the top."))
    # Test load_positive_negative_data_files
    #x_text, y = load_positive_negative_data_files("./tiny_data/rt-polarity.pos", "./tiny_data/rt-polarity.neg")
    #print(x_text)
    #print(y)
    # Test batch_iter
    #batches = batch_iter(x_text, 2, 4)
    #for batch in batches:
    #    print(batch)
def mkdir_if_not_exist(dirpath):
    if not os.path.exists(dirpath):
        os.mkdir(dirpath)

def seperate_line(line):
    # Insert a space after every character so Chinese text can later be split character by character
    return ''.join([word + ' ' for word in line])

def read_and_clean_zh_file(input_file, output_cleaned_file =
Attribute         Size  Date       Time   Name
-----------  ---------  ---------- -----  ----
Directory            0  2017-06-15 02:39  zh_cnn_text_classify-master\
File                14  2017-06-15 02:39  zh_cnn_text_classify-master\.gitignore
File              1917  2017-06-15 02:39  zh_cnn_text_classify-master\README.md
Directory            0  2017-06-15 02:39  zh_cnn_text_classify-master\data\
File             59489  2017-06-15 02:39  zh_cnn_text_classify-master\data\ham_100.utf8
File             44997  2017-06-15 02:39  zh_cnn_text_classify-master\data\spam_100.utf8
File              4504  2017-06-15 02:39  zh_cnn_text_classify-master\data_helpers.py
File              4870  2017-06-15 02:39  zh_cnn_text_classify-master\eval.py
Directory            0  2017-06-15 02:39  zh_cnn_text_classify-master\runs\
Directory            0  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\
Directory            0  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\
File               697  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\checkpoint
File           2373156  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-200.data-00000-of-00001
File              1009  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-200.index
File            102143  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-200.me
File           2373156  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-300.data-00000-of-00001
File              1009  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-300.index
File            102143  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-300.me
File           2373156  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-400.data-00000-of-00001
File              1009  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-400.index
File            102143  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-400.me
File           2373156  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-500.data-00000-of-00001
File              1009  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-500.index
File            102143  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-500.me
File           2373156  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-600.data-00000-of-00001
File              1009  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-600.index
File            102143  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-600.me
File             46336  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\prediction.csv
Directory            0  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\summaries\
Directory            0  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\summaries\dev\
File            159244  2017-06-15 02:39  zh_cnn_text_classify-master\runs\1492954581\summaries\dev\events.out.tfevents.1492954586.escenter11PC
............ (7 more file entries omitted)
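For reference, a minimal example of how the padding and batching helpers above fit together, assuming the snippet is saved as data_helpers.py (as in the file listing); the sentences and the <PADDING> token here are toy placeholders, not data from the archive:

import data_helpers  # the snippet above, saved as data_helpers.py

# Whitespace-separated tokens, as produced by seperate_line()/read_and_clean_zh_file()
sentences = ["我 喜欢 这 部 电影", "太 差 了"]

# Pad every sentence to the length of the longest one
padded, max_len = data_helpers.padding_sentences(sentences, "<PADDING>")
print(max_len)    # 5
print(padded[1])  # ['太', '差', '了', '<PADDING>', '<PADDING>']

# Iterate over shuffled mini-batches for two epochs
for batch in data_helpers.batch_iter(padded, batch_size=2, num_epochs=2):
    print(batch.shape)  # (2, 5) for each batch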
Related resources
- Deep Learning with Python, Francois Chollet
- Gesture recognition based on convolutional neural networks
- CNN for digit sequences beyond image classification (.rar)
- DnCNN TensorFlow implementation
- Python-Tensorflow implementation of SpatialAsDeepSpatialCNN
- CNN+pythoncode8.18.zip
- Pulmonary nodule recognition using CNN
- NLP in practice: fastText text classification on THUCNews
- RNN python
- Maximum-entropy text classification
- License plate recognition, Tensorflow_CNN_python_opencv.zip
- NetEase news data for Chinese text classification
- Text classification with the Naive Bayes algorithm_Pyt
- cifar-10 90%+ code
- Naive Bayes text classification in Python (incl.
- Python implementation of deep learning testCNN
- Converting labelme-annotated datasets to COCO-format datasets
- Python project case development from beginner to practice, source code
- Finger vein recognition, keras, CNN
- Building a CNN convolutional neural network with TensorFlow in Python 3
- CNN convolutional neural network Python code
- Attention-CNN (by Jianlong-Fu)
- Handwritten digit recognition based on convolutional neural networks
- CIFAR-10 image classification with a Tensorflow CNN, p
- cython_bbox.so
- CNN code suitable for beginners
- Text classification based on Naive Bayes
- LDA text classification algorithm
- Source code for CNN recognition of MNIST, with detailed comments
- Naive Bayes text classifier implemented in Python