• 大小: 20KB
    文件类型: .zip
    金币: 1
    下载: 0 次
    发布日期: 2021-05-23
  • 语言: Python
  • 标签: 机器学习  

资源简介

bayes.py 为主体代码,可在终端中通过 python 命令运行;代码中包含中文注释,压缩包内同时附带训练集与测试集。

资源截图

代码片段和文件信息

from math import log
from numpy import *
import operator

def loadDataSet():
    """Return a small toy corpus of tokenized posts and their class labels.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        token lists and ``classVec`` holds the label for each of them in the
        same order: 1 marks abusive wording, 0 marks normal speech.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive wording, 0 = normal speech
    return postingList, classVec

# Build a list of the distinct words that occur anywhere in the documents.
def createVocabList(dataSet):
    """Collect every distinct token found in ``dataSet``.

    Args:
        dataSet: an iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list holding each distinct token once. The list is produced from a
        set, so its ordering is whatever the set yields.
    """
    uniqueWords = set()  # start from an empty set
    for doc in dataSet:
        # Union in this document's tokens; duplicates collapse automatically.
        uniqueWords = uniqueWords | set(doc)
    vocabulary = list(uniqueWords)
    return vocabulary

def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a binary presence vector over the vocabulary.

    Args:
        vocabList: list of vocabulary words; position defines the vector slot.
        inputSet: iterable of the document's tokens.

    Returns:
        A list of ``len(vocabList)`` ints where slot ``i`` is 1 if
        ``vocabList[i]`` occurs in ``inputSet`` and 0 otherwise. Tokens that
        are not in the vocabulary are reported on stdout and otherwise ignored.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        try:
            # A single index() call both tests membership and finds the slot.
            returnVec[vocabList.index(word)] = 1
        except ValueError:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

# trainMatrix holds the document word vectors, trainCategory their class labels.
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive-Bayes parameters from labelled word vectors.

    Word counts are smoothed by adding one to every count (and the vocabulary
    size to every class total), so a word never seen in a class does not get
    probability zero and a class without documents does not divide by zero.
    The per-word results are returned as logarithms because classifyNB adds
    them to the log of the class prior.

    Args:
        trainMatrix: sequence of equal-length numeric word vectors, one per
            document.
        trainCategory: sequence of labels aligned with ``trainMatrix``;
            label 1 selects class 1, anything else is counted as class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the log conditional word probabilities
        for class 0 and class 1, and the fraction of documents labelled 1.

    Raises:
        IndexError: if ``trainMatrix`` is empty.
    """
    numTrainDocs = len(trainMatrix)  # number of training documents
    numWords = len(trainMatrix[0])   # length of each word vector
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    # Add-one smoothing: every word starts with a count of 1 in each class,
    # and each class total starts at the number of words to compensate.
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = float(numWords)
    p1Denom = float(numWords)

    for docVec, label in zip(trainMatrix, trainCategory):
        if label == 1:
            p1Num += docVec
            p1Denom += sum(docVec)
        else:
            p0Num += docVec
            p0Denom += sum(docVec)

    # `log` is numpy's element-wise log here: the module's numpy star import
    # comes after (and therefore shadows) `from math import log`.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

# Classify by picking the class with the higher score.
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return the class (1 or 0) with the larger naive-Bayes score.

    Each class score is the sum of the element-wise product of
    ``vec2Classify`` with that class's vector, plus the log of the class
    prior. Because the prior enters as a logarithm, ``p0Vec`` and ``p1Vec``
    need to be log-probability vectors for the two terms to be comparable.

    Args:
        vec2Classify: numeric word vector of the document to classify.
        p0Vec: per-word vector for class 0.
        p1Vec: per-word vector for class 1.
        pClass1: prior probability of class 1; class 0 uses ``1.0 - pClass1``.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0

def testingNB():
    """Train on the toy corpus and print the predicted class of two samples.

    Builds the vocabulary from loadDataSet(), converts every post with
    setOfWords2Vec, trains with trainNB0, then prints the classifyNB result
    for each sample entry.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

    # Same two sample entries as before, classified one after the other.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))

# Unlike setOfWords2Vec, which only records whether a word is present,
# bagOfWords2Vec counts every occurrence of a word.
def bagOfWords2Vec(vocabList, inputSet):
    """Encode a document as a word-count vector over the vocabulary.

    Args:
        vocabList: list of vocabulary words; position defines the vector slot.
        inputSet: iterable of the document's tokens.

    Returns:
        A list of ``len(vocabList)`` ints where slot ``i`` is the number of
        times ``vocabList[i]`` occurs in ``inputSet``. Tokens that are not in
        the vocabulary are skipped silently.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        try:
            # A single index() call both tests membership and finds the slot.
            returnVec[vocabList.index(word)] += 1
        except ValueError:
            # Out-of-vocabulary tokens are deliberately ignored.
            continue
    return returnVec

#使用贝叶斯算法实现垃圾邮件过滤
#将一个大字符串解析为字符串列表
def textParse(bigString):
    import re
    listOfTokens = re.spli

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2017-12-14 21:14  bayes\
     文件        5313  2017-12-14 20:56  bayes\bayes.py
     目录           0  2011-12-20 11:36  bayes\email\
     目录           0  2017-12-14 16:46  bayes\email\ham\
     文件         148  2010-10-23 17:11  bayes\email\ham\1.txt
     文件          86  2017-12-14 16:44  bayes\email\ham\10.txt
     文件         130  2017-12-14 16:44  bayes\email\ham\11.txt
     文件         184  2017-12-14 16:45  bayes\email\ham\12.txt
     文件         174  2010-10-23 17:13  bayes\email\ham\13.txt
     文件         172  2017-12-14 16:45  bayes\email\ham\14.txt
     文件         531  2017-12-14 16:37  bayes\email\ham\15.txt
     文件          91  2017-12-14 16:45  bayes\email\ham\16.txt
     文件         466  2017-12-14 16:45  bayes\email\ham\17.txt
     文件         177  2017-12-14 16:45  bayes\email\ham\18.txt
     文件         161  2017-12-14 16:46  bayes\email\ham\19.txt
     文件         239  2017-12-14 16:42  bayes\email\ham\2.txt
     文件         208  2010-10-23 09:26  bayes\email\ham\20.txt
     文件         236  2017-12-14 16:46  bayes\email\ham\21.txt
     文件         332  2017-12-14 16:46  bayes\email\ham\22.txt
     文件         607  2017-12-14 16:46  bayes\email\ham\23.txt
     文件          42  2010-10-23 09:33  bayes\email\ham\24.txt
     文件          89  2010-10-23 09:34  bayes\email\ham\25.txt
     文件         373  2017-12-14 16:43  bayes\email\ham\3.txt
     文件         213  2017-12-14 16:43  bayes\email\ham\4.txt
     文件         114  2010-10-23 17:11  bayes\email\ham\5.txt
     文件        1467  2017-12-14 15:44  bayes\email\ham\6.txt
     文件         109  2010-10-23 17:12  bayes\email\ham\7.txt
     文件         638  2010-10-23 08:58  bayes\email\ham\8.txt
     文件         148  2017-12-14 16:44  bayes\email\ham\9.txt
     目录           0  2017-12-14 16:40  bayes\email\spam\
     文件         238  2010-10-23 08:28  bayes\email\spam\1.txt
............此处省略24个文件信息

评论

共有 条评论