资源简介
基于朴素贝叶斯的垃圾邮件分类,对垃圾邮件的分类有较好的效果,达到99%。
代码片段和文件信息
# -*- coding: utf-8 -*-
import numpy as np
def textParser(text):
    """
    Tokenize an SMS message into lowercase alphabetic words.

    Every character other than an ASCII letter (digits, punctuation,
    whitespace, ...) acts as a separator, so only runs of letters survive.

    :param text: raw message text
    :return: list of lowercase words in order of appearance; empty when the
        text contains no letters
    """
    import re
    # Collecting letter runs directly is equivalent to splitting on
    # non-letters and then discarding the empty strings that appear between
    # adjacent separators.
    return [word.lower() for word in re.findall(r'[a-zA-Z]+', text)]
def loadSMSData(fileName):
    """
    Load a tab-separated SMS corpus made of ``label<TAB>message`` lines.

    :param fileName: path of the corpus file
    :return: tuple ``(smsWords, classCategory)``; ``smsWords[i]`` is the token
        list returned by ``textParser`` for the i-th accepted line and
        ``classCategory[i]`` is 1 for a 'spam' label or 0 for a 'ham' label
    """
    labelCodes = {'ham': 0, 'spam': 1}
    classCategory = []  # class labels: 1 = spam SMS, 0 = normal SMS
    smsWords = []
    # The context manager closes the file even if parsing raises.
    with open(fileName) as f:
        for line in f:
            linedatas = line.strip().split('\t')
            # Skip blank/malformed lines and unknown labels so the two result
            # lists always have the same length and stay index-aligned.
            if len(linedatas) < 2 or linedatas[0] not in labelCodes:
                continue
            classCategory.append(labelCodes[linedatas[0]])
            # Tokenize the message text.
            smsWords.append(textParser(linedatas[1]))
    return smsWords, classCategory
def createVocabularyList(smsWords):
    """
    Build the vocabulary: every distinct word found in the tokenized messages.

    :param smsWords: iterable of word lists, one list per message
    :return: list of the unique words; the order is whatever the underlying
        set yields and is not otherwise defined
    """
    vocabularySet = set()
    for words in smsWords:
        # Update in place instead of building a new union set per message,
        # which would copy the whole accumulated vocabulary every iteration.
        vocabularySet.update(words)
    return list(vocabularySet)
def getVocabularyList(fileName):
    """
    Read the vocabulary from the first line of a vocabulary file.

    Only the first line is read; it is stripped of surrounding whitespace and
    split on tab characters. An empty first line therefore yields ``['']``.

    :param fileName: path of the vocabulary file
    :return: list of the tab-separated entries on the first line
    """
    # The context manager guarantees the handle is closed even if the read
    # raises, which the manual open()/close() pair did not.
    with open(fileName) as fr:
        return fr.readline().strip().split('\t')
def setOfWordsToVecTor(vocabularyList, smsWords):
    """
    Count how often each vocabulary word occurs in one tokenized message.

    :param vocabularyList: list of vocabulary words; position i of the result
        corresponds to ``vocabularyList[i]``
    :param smsWords: list of words of a single message
    :return: list of ints, same length as ``vocabularyList``, holding the
        occurrence count of each vocabulary word; words that are not in the
        vocabulary are ignored
    """
    # Build the word -> position table once so each lookup is O(1) instead of
    # two linear scans (`in` followed by `.index`) per token. setdefault keeps
    # the first position of a duplicated word, matching list.index().
    positions = {}
    for idx, vocabWord in enumerate(vocabularyList):
        positions.setdefault(vocabWord, idx)

    vocabMarked = [0] * len(vocabularyList)
    for smsWord in smsWords:
        idx = positions.get(smsWord)
        if idx is not None:
            vocabMarked[idx] += 1
    return vocabMarked
def setOfWordsListToVecTor(vocabularyList, smsWordsList):
    """
    Convert every tokenized message into its vocabulary count vector.

    :param vocabularyList: list of vocabulary words
    :param smsWordsList: list of word lists, one per message
    :return: list with one count vector per message, in input order, each
        produced by ``setOfWordsToVecTor``
    """
    # Iterate the messages directly rather than through range(len(...)).
    return [setOfWordsToVecTor(vocabularyList, smsWords)
            for smsWords in smsWordsList]
def trainingNaiveBayes(trainMarkedWords trainCategory):
“““
训练数据集中获取语料库中词汇的spamicity:P(Wi|S)
:param trainMarkedWords: 按照语料库标记的数据,二维数组
:param trainCategory:
:return:
“““
numTrainDoc = len(trainMarkedWords)
numWords = len(trainMarkedWords[0])
# 是垃圾邮件的先验概率P(S)
pSpam = sum(trainCategory) / float(numTrainDoc)
# 统计语料库中词汇在S和H中出现的次数
wordsInSpamNum = np.ones(numWords)
wordsInHealthNum = np.ones(numWords)
spamWordsNum = 2.0
healthWordsNum = 2.0
for i in range(0 numTrainDoc):
if trainCategory[i] == 1: # 如果是垃圾SMS或邮件
wordsInSpamNum += trainMarkedWords[i]
spamWordsNum += sum(trainMarkedWords[i]) # 统计Spam中语料库中词汇出现的总次数
else:
wordsInHealthNum += trainMarkedWords[i]
healthWo
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 5111 2018-05-07 16:36 SMS\NaiveBayes\NaiveBayes.py
文件 4842 2017-04-22 10:48 SMS\NaiveBayes\NaiveBayes.pyc
文件 14 2017-04-19 16:33 SMS\NaiveBayes\pSpam.txt
文件 210195 2017-04-19 16:33 SMS\NaiveBayes\pWordsHealthy.txt
文件 210195 2017-04-19 16:33 SMS\NaiveBayes\pWordsSpamicity.txt
文件 198723 2017-04-22 13:55 SMS\NaiveBayes\ROC Curve.png
文件 3876 2017-04-22 13:53 SMS\NaiveBayes\SenSpeciList0.csv
文件 3876 2017-04-22 13:53 SMS\NaiveBayes\SenSpeciList1.csv
文件 3876 2017-04-22 13:54 SMS\NaiveBayes\SenSpeciList2.csv
文件 3876 2017-04-22 13:54 SMS\NaiveBayes\SenSpeciList3.csv
文件 3952 2017-04-22 13:55 SMS\NaiveBayes\SenSpeciList4.csv
文件 477907 2011-03-15 22:36 SMS\NaiveBayes\SMSSpamCollection.txt
文件 3239 2018-05-07 16:36 SMS\NaiveBayes\test.py
文件 1942 2017-04-20 11:15 SMS\NaiveBayes\test.pyc
文件 802 2018-05-07 16:36 SMS\NaiveBayes\TestPlot.py
文件 1141 2018-05-07 16:36 SMS\NaiveBayes\training.py
文件 54677 2017-04-19 16:33 SMS\NaiveBayes\vocabularyList.txt
文件 58 2017-04-19 16:32 SMS\NaiveBayes\__init__.py
目录 0 2017-04-23 08:39 SMS\NaiveBayes
目录 0 2017-04-23 08:39 SMS
----------- --------- ---------- ----- ----
1188302 20
评论
共有 条评论