资源简介
这是一个用Python编写的情感文本分析程序,实现了TF和BOOL两种词项权重(term weight)计算方式,并实现了特征选择算法。文件夹中附带数据集。
代码片段和文件信息
# coding=gbk
import re
import numpy as np
from numpy import *
#################################文本处理############################################
def testTextParse(filename):
    """Parse a corpus file into one list of terms per document.

    The whole file is read, cut into documents with the document regex,
    and every document is split on runs of whitespace. Empty strings and
    the punctuation tokens listed in ``ignored_terms`` are dropped.

    Args:
        filename: Path of the corpus file to read.

    Returns:
        A list with one entry per matched document; each entry is the list
        of remaining terms in their original order (possibly empty).
    """
    # NOTE(review): the '\xa1..' / '\xa3..' entries look like GBK byte pairs
    # for full-width Chinese punctuation (the file declares coding=gbk).
    # They can only match when terms are byte strings (Python 2 `str`);
    # confirm the intended handling before running under Python 3.
    ignored_terms = frozenset([
        '.', '!', '?', '(', ')',
        '"', "'",
        '\xa1\xa3', '\xa3\xac', '\xa3\xbf', '\xa3\xa1', '\xa3\xbb',
        '\xa3\xba', '\xa1\xb0', '\xa1\xb1', '\xa1\xae', '\xa1\xaf',
        '\xa3\xa8', '\xa3\xa9', '\xa1\xa2',
    ])

    # Close the file deterministically instead of relying on garbage collection.
    with open(filename) as corpus:
        text = corpus.read()

    # NOTE(review): this pattern appears to have lost surrounding markup
    # during extraction; as written a "document" is any run of characters
    # terminated by a single space. Restore the delimiters from the
    # original nbayes.py if available.
    pattern = '(.*?) '
    str_list = re.findall(pattern, text, re.S)

    # `\s+` rather than `\s*`: since Python 3.7 re.split also splits on empty
    # matches, so `\s*` would break every document into single characters.
    ptn = re.compile(r'\s+')

    doc_list = []
    for doc in str_list:
        doc_list.append([term for term in ptn.split(doc)
                         if term and term not in ignored_terms])
    return doc_list


# The name starts with "test" but this is a parser, not a unit test; keep
# pytest/nose from collecting it by name.
testTextParse.__test__ = False
def cvTextParse(filename, start, end):
    """Parse a window of the documents in a corpus file (cross-validation).

    Documents are extracted and tokenised on whitespace; empty strings and
    the punctuation tokens in ``ignored_terms`` are dropped. Only documents
    selected by the ``start``/``end`` counters are returned.

    Selection rule: documents are numbered from 1. A running counter starts
    at ``start`` and is incremented for every document whose number is
    >= ``start``; the document is kept while that counter is <= ``end``.
    For ``start >= 1`` this keeps documents ``start`` .. ``end - 1``.

    Args:
        filename: Path of the corpus file to read.
        start: Number of the first document to consider (1-based).
        end: Upper bound for the running counter described above.

    Returns:
        A list of term lists, one per selected document, in file order.
    """
    # NOTE(review): the '\xa1..' / '\xa3..' entries look like GBK byte pairs
    # for full-width Chinese punctuation (the file declares coding=gbk).
    # They can only match when terms are byte strings (Python 2 `str`);
    # confirm the intended handling before running under Python 3.
    ignored_terms = frozenset([
        '.', '!', '?', '(', ')',
        '"', "'",
        '\xa1\xa3', '\xa3\xac', '\xa3\xbf', '\xa3\xa1', '\xa3\xbb',
        '\xa3\xba', '\xa1\xb0', '\xa1\xb1', '\xa1\xae', '\xa1\xaf',
        '\xa3\xa8', '\xa3\xa9', '\xa1\xa2',
    ])

    # Close the file deterministically instead of relying on garbage collection.
    with open(filename) as corpus:
        text = corpus.read()

    # NOTE(review): this pattern appears to have lost surrounding markup
    # during extraction; as written a "document" is any run of characters
    # terminated by a single space. Restore the delimiters from the
    # original nbayes.py if available.
    pattern = '(.*?) '
    str_list = re.findall(pattern, text, re.S)

    # `\s+` rather than `\s*`: since Python 3.7 re.split also splits on empty
    # matches, so `\s*` would break every document into single characters.
    ptn = re.compile(r'\s+')

    # NOTE(review): the nesting of the two counter checks is reconstructed
    # (the extracted source had no indentation) — confirm against the
    # original file.
    doc_list = []
    start_index = 0
    end_index = start
    for doc in str_list:
        start_index += 1
        if start_index < start:
            continue  # still before the requested window
        end_index += 1
        if end_index > end:
            # The counter only grows, so no later document can qualify.
            break
        doc_list.append([term for term in ptn.split(doc)
                         if term and term not in ignored_terms])
    return doc_list
def outputTextParse(filename):
    """Read a file and return its text with all whitespace removed.

    Args:
        filename: Path of the file to read.

    Returns:
        The file content with every run of whitespace deleted.
    """
    # Close the file deterministically instead of relying on garbage collection.
    with open(filename) as source:
        text = source.read()

    # The extracted pattern was '| |\s*': its first alternative is empty, so
    # it always matched the empty string and the substitution removed
    # nothing. Match non-empty whitespace runs instead.
    # NOTE(review): the empty alternatives look like markup tags that were
    # stripped during extraction; if the original nbayes.py removed tags
    # here as well, restore them in this pattern.
    ptn = re.compile(r'\s+')
    outputText = ptn.sub('', text)
    return outputText
##############################类别向量生成#############################################
def gen_class_list_n(k):
    """Build the class-label list for negative reviews.

    Args:
        k: Number of labels to generate.

    Returns:
        A new list containing ``k`` zeros (empty when ``k <= 0``).
    """
    # 0 is the label used for negative reviews.
    return [0] * k
def gen_class_list_p(k):
    """Build the class-label list for positive reviews.

    Args:
        k: Number of labels to generate.

    Returns:
        A new list containing ``k`` ones (empty when ``k <= 0``).
    """
    # 1 is the label used for positive reviews.
    return [1] * k
##############################词条向量生成#############################################
def createTermSet(doc_list):
    """Return the distinct terms that occur in any of the documents.

    Args:
        doc_list: Iterable of documents, each an iterable of hashable terms.

    Returns:
        A list holding every distinct term exactly once. The order is
        whatever set iteration yields and should not be relied upon.
    """
    termSet = set()
    for doc in doc_list:
        # In-place update avoids rebuilding the whole set for every document.
        termSet.update(doc)
    return list(termSet)
def saveTermSet(te
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2014-01-10 11:07 朴素贝叶斯文本分类\
目录 0 2014-01-10 11:07 朴素贝叶斯文本分类\dataset\
文件 934320 2013-12-09 22:09 朴素贝叶斯文本分类\dataset\negative.txt
文件 600153 2013-12-09 22:09 朴素贝叶斯文本分类\dataset\positive.txt
文件 20 2013-12-13 20:39 朴素贝叶斯文本分类\dataset\testDataset0.txt
文件 99 2013-12-17 10:00 朴素贝叶斯文本分类\dataset\testDataset1.txt
文件 15973 2013-12-18 22:05 朴素贝叶斯文本分类\nbayes.py
文件 87 2013-12-18 13:27 朴素贝叶斯文本分类\nbayes_calssify.py
文件 108 2013-12-18 09:43 朴素贝叶斯文本分类\nbayes_validate.py
文件 457 2013-12-18 16:26 朴素贝叶斯文本分类\代码说明.txt
评论
共有 条评论