• 大小: 352KB
    文件类型: .rar
    金币: 1
    下载: 0 次
    发布日期: 2021-05-15
  • 语言: 其他
  • 标签:

资源简介

基于朴素贝叶斯的垃圾邮件分类,对垃圾邮件的分类有较好的效果,达到 99%。

资源截图

代码片段和文件信息

# -*- coding: utf-8 -*-

import numpy as np

def textParser(text):
    """
    Split raw SMS text into lowercase alphabetic words.

    Every character that is not an ASCII letter (digits included) acts as a
    separator; the empty strings produced by adjacent separators are dropped.

    :param text: raw message text
    :return: list of lowercase words, in order of appearance
    """
    import re
    # Non-letters and digits are delimiters, so only plain words survive.
    regEx = re.compile(r'[^a-zA-Z]|\d')
    return [word.lower() for word in regEx.split(text) if word]


def loadSMSData(fileName):
    """
    Load a labelled SMS corpus from a tab-separated text file.

    Each usable line holds a label (``ham`` or ``spam``), a TAB, and the
    message text. Lines that are blank, have no TAB, or carry any other
    label are skipped so that the two returned lists always stay aligned
    index-for-index.

    :param fileName: path of the corpus file
    :return: tuple ``(smsWords, classCategory)`` where ``smsWords`` is a list
             of word lists (one per message, produced by ``textParser``) and
             ``classCategory`` is the matching list of labels
             (1 = spam, 0 = ham)
    """
    labelToCategory = {'ham': 0, 'spam': 1}
    classCategory = []  # class labels: 1 = spam SMS, 0 = normal SMS
    smsWords = []
    # The context manager closes the file even if parsing raises.
    with open(fileName) as f:
        for line in f:
            linedatas = line.strip().split('\t')
            # Skip malformed or unlabelled lines instead of crashing or
            # appending words without a matching label.
            if len(linedatas) < 2 or linedatas[0] not in labelToCategory:
                continue
            classCategory.append(labelToCategory[linedatas[0]])
            # Tokenize the message text.
            smsWords.append(textParser(linedatas[1]))
    return smsWords, classCategory


def createVocabularyList(smsWords):
    """
    Build the vocabulary: every distinct word found in the given messages.

    :param smsWords: iterable of word lists, one per message
    :return: list of unique words (order is unspecified)
    """
    # A single union call avoids rebuilding the set once per message.
    return list(set().union(*smsWords))


def getVocabularyList(fileName):
    """
    Read the vocabulary from a vocabulary list file.

    Only the first line of the file is used; it is stripped of surrounding
    whitespace and split on TAB characters.

    :param fileName: path of the vocabulary file
    :return: list of the TAB-separated entries on the first line
    """
    # The context manager closes the file even if reading raises.
    with open(fileName) as fr:
        return fr.readline().strip().split('\t')


def setOfWordsToVecTor(vocabularyList, smsWords):
    """
    Count how often each vocabulary word occurs in one message.

    Words of the message that are not in the vocabulary are ignored.

    :param vocabularyList: list of vocabulary words
    :param smsWords: list of words of a single message
    :return: list of counts, one per entry of ``vocabularyList`` and in the
             same order
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary list twice for every word of the message. ``setdefault``
    # keeps the first occurrence, matching ``list.index``.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabularyList):
        wordIndex.setdefault(vocabWord, position)

    vocabMarked = [0] * len(vocabularyList)
    for smsWord in smsWords:
        position = wordIndex.get(smsWord)
        if position is not None:
            vocabMarked[position] += 1
    return vocabMarked


def setOfWordsListToVecTor(vocabularyList, smsWordsList):
    """
    Convert a list of messages into a list of vocabulary count vectors.

    :param vocabularyList: list of vocabulary words
    :param smsWordsList: list of word lists, one per message
    :return: list with one count vector per message, each produced by
             ``setOfWordsToVecTor``
    """
    return [setOfWordsToVecTor(vocabularyList, smsWords)
            for smsWords in smsWordsList]


def trainingNaiveBayes(trainMarkedWords trainCategory):
    “““
    训练数据集中获取语料库中词汇的spamicity:P(Wi|S)
    :param trainMarkedWords: 按照语料库标记的数据,二维数组
    :param trainCategory:
    :return:
    “““
    numTrainDoc = len(trainMarkedWords)
    numWords = len(trainMarkedWords[0])
    # 是垃圾邮件的先验概率P(S)
    pSpam = sum(trainCategory) / float(numTrainDoc)

    # 统计语料库中词汇在S和H中出现的次数
    wordsInSpamNum = np.ones(numWords)
    wordsInHealthNum = np.ones(numWords)
    spamWordsNum = 2.0
    healthWordsNum = 2.0
    for i in range(0 numTrainDoc):
        if trainCategory[i] == 1:  # 如果是垃圾SMS或邮件
            wordsInSpamNum += trainMarkedWords[i]
            spamWordsNum += sum(trainMarkedWords[i])  # 统计Spam中语料库中词汇出现的总次数
        else:
            wordsInHealthNum += trainMarkedWords[i]
            healthWo

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----

     文件       5111  2018-05-07 16:36  SMS\NaiveBayes\NaiveBayes.py

     文件       4842  2017-04-22 10:48  SMS\NaiveBayes\NaiveBayes.pyc

     文件         14  2017-04-19 16:33  SMS\NaiveBayes\pSpam.txt

     文件     210195  2017-04-19 16:33  SMS\NaiveBayes\pWordsHealthy.txt

     文件     210195  2017-04-19 16:33  SMS\NaiveBayes\pWordsSpamicity.txt

     文件     198723  2017-04-22 13:55  SMS\NaiveBayes\ROC Curve.png

     文件       3876  2017-04-22 13:53  SMS\NaiveBayes\SenSpeciList0.csv

     文件       3876  2017-04-22 13:53  SMS\NaiveBayes\SenSpeciList1.csv

     文件       3876  2017-04-22 13:54  SMS\NaiveBayes\SenSpeciList2.csv

     文件       3876  2017-04-22 13:54  SMS\NaiveBayes\SenSpeciList3.csv

     文件       3952  2017-04-22 13:55  SMS\NaiveBayes\SenSpeciList4.csv

     文件     477907  2011-03-15 22:36  SMS\NaiveBayes\SMSSpamCollection.txt

     文件       3239  2018-05-07 16:36  SMS\NaiveBayes\test.py

     文件       1942  2017-04-20 11:15  SMS\NaiveBayes\test.pyc

     文件        802  2018-05-07 16:36  SMS\NaiveBayes\TestPlot.py

     文件       1141  2018-05-07 16:36  SMS\NaiveBayes\training.py

     文件      54677  2017-04-19 16:33  SMS\NaiveBayes\vocabularyList.txt

     文件         58  2017-04-19 16:32  SMS\NaiveBayes\__init__.py

     目录          0  2017-04-23 08:39  SMS\NaiveBayes

     目录          0  2017-04-23 08:39  SMS

----------- ---------  ---------- -----  ----

              1188302                    20


评论

共有 条评论