• 大小:
    文件类型: .rar
    金币: 2
    下载: 1 次
    发布日期: 2021-06-17
  • 语言: Python
  • 标签:

资源简介

朴素贝叶斯文本分类python实现(含数据集)

资源截图

代码片段和文件信息

# -*- coding: utf-8 -*-
"""
Created on Mon Oct 30 21:39:21 2017

@author: Q
"""
import numpy as np
import re
import feedparser
import operator
def loadDataSet():
    """Return a small hard-coded corpus of tokenized posts with labels.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        token lists; ``classVec`` holds the label for each post in the same
        order, where 1 marks an abusive post and 0 a normal one.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec

def createVocabList(data):
    """Build the vocabulary: every distinct item found in any document.

    Args:
        data: An iterable of documents, each itself an iterable of hashable
            items (e.g. a list of word tokens).

    Returns:
        A list containing each distinct item exactly once. The order is that
        of set iteration and therefore unspecified.
    """
    # Union all documents into one set so duplicates collapse.
    vocabulary = set().union(*data)
    return list(vocabulary)
    

def setofWords2Vec(vocabList, data):
    """Convert a document into a count vector aligned with the vocabulary.

    Despite the "set of words" name, every occurrence is counted, so the
    result holds word frequencies rather than 0/1 presence flags.

    Args:
        vocabList: List of vocabulary items; position ``i`` of the result
            corresponds to ``vocabList[i]``.
        data: Iterable of items (tokens) making up one document.

    Returns:
        A list of ints with the same length as ``vocabList``. Items of
        ``data`` that are not in the vocabulary are ignored.
    """
    # Map each vocabulary item to its first position once, instead of
    # scanning the list for every token.
    position = {}
    for index, word in enumerate(vocabList):
        position.setdefault(word, index)

    returnList = [0] * len(vocabList)
    for vocab in data:
        index = position.get(vocab)
        if index is not None:
            returnList[index] += 1
    return returnList

    
def trainNB0(trainMatrix, trainCategory):
    """Train a two-class naive Bayes model from count vectors.

    Per-word counts start at 1 and the per-class totals at 2, so a word that
    never appears in a class does not get probability zero. The conditional
    probabilities are returned as logarithms.

    Args:
        trainMatrix: Sequence of equal-length numeric count vectors, one per
            training document.
        trainCategory: Sequence of labels, one per document; a label equal
            to 1 is class 1, anything else is treated as class 0.

    Returns:
        A ``(p0Vect, p1Vect, pAbusive)`` tuple: the log conditional word
        probabilities for class 0 and class 1 (NumPy arrays), and the
        fraction of training documents labelled 1.
    """
    # float() keeps this a true division even under Python 2 semantics.
    pAbusive = sum(trainCategory) / float(len(trainCategory))
    numWords = len(trainMatrix[0])
    p1num = np.ones(numWords)
    p0num = np.ones(numWords)
    p1Denom = 2.0
    p0Denom = 2.0
    for i, label in enumerate(trainCategory):
        row = np.asarray(trainMatrix[i])
        if label == 1:
            p1num = p1num + row
            p1Denom = p1Denom + row.sum()
        else:
            p0num = p0num + row
            p0Denom = p0Denom + row.sum()
    p1Vect = np.log(p1num / p1Denom)
    p0Vect = np.log(p0num / p0Denom)
    return p0Vect, p1Vect, pAbusive

    
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a count vector with a trained naive Bayes model.

    Each class score is the sum of the feature vector multiplied elementwise
    by that class's log-probability vector, plus the log of the class prior.

    Args:
        vec2Classify: Numeric feature vector (list or NumPy array) aligned
            with the probability vectors.
        p0Vec: Log conditional word probabilities for class 0.
        p1Vec: Log conditional word probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    # asarray lets callers pass a plain list as well as an ndarray.
    features = np.asarray(vec2Classify)
    p0 = np.sum(features * p0Vec) + np.log(1 - pClass1)
    p1 = np.sum(features * p1Vec) + np.log(pClass1)
    return 1 if p1 > p0 else 0
def textParse(bigString):
    """Split raw text into lowercase tokens.

    The text is split on runs of non-word characters, and tokens of two
    characters or fewer are dropped.

    Args:
        bigString: The text to tokenize.

    Returns:
        A list of lowercase tokens, each longer than two characters, in
        their original order.
    """
    splitdata = re.split(r'\W+', bigString)
    return [token.lower() for token in splitdata if len(token) > 2]
def spamTest():
    docList = []
    classList = []
    for i in range(126):
        with open(‘spam/%d.txt‘%i) as f:
            doc = f.read()
        docList.append(doc)
        classList.append(1)
        with open(‘ham/%d.txt‘%i) as f:
            doc = f.read()
        docList.append(doc)
        classList.append(0)
    vocalList = createVocabList(docList)
    trainList = list(range(50))
    testList = []
    for i in range(13):
        num = int(np.random.uniform(0len(docList))-10)
        testList.append(trainList[num])
        del(trainList[num])
    docMatrix = []
    docClass = []
    for i in trainList:
        subVec = setofWords2Vec(vocalListdocList[i])
        docMatrix.append(subVec)
        docClass.append(classList[i])
    p0vp1vpAb = trainNB0(docMatrixdocClass

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----

     文件          6  2017-12-03 20:35  beyes\.git\COMMIT_EDITMSG

     文件        297  2017-12-03 20:36  beyes\.git\config

     文件         73  2017-12-03 20:35  beyes\.git\description

     文件         23  2017-12-03 20:35  beyes\.git\HEAD

     文件        478  2017-12-03 20:35  beyes\.git\hooks\applypatch-msg.sample

     文件        896  2017-12-03 20:35  beyes\.git\hooks\commit-msg.sample

     文件        189  2017-12-03 20:35  beyes\.git\hooks\post-update.sample

     文件        424  2017-12-03 20:35  beyes\.git\hooks\pre-applypatch.sample

     文件       1642  2017-12-03 20:35  beyes\.git\hooks\pre-commit.sample

     文件       1348  2017-12-03 20:35  beyes\.git\hooks\pre-push.sample

     文件       4898  2017-12-03 20:35  beyes\.git\hooks\pre-rebase.sample

     文件        544  2017-12-03 20:35  beyes\.git\hooks\pre-receive.sample

     文件       1239  2017-12-03 20:35  beyes\.git\hooks\prepare-commit-msg.sample

     文件       3610  2017-12-03 20:35  beyes\.git\hooks\update.sample

     文件       4125  2017-12-03 20:35  beyes\.git\index

     文件        240  2017-12-03 20:35  beyes\.git\info\exclude

     文件        152  2017-12-03 20:35  beyes\.git\logs\HEAD

     文件        152  2017-12-03 20:35  beyes\.git\logs\refs\heads\master

     文件        143  2017-12-03 20:36  beyes\.git\logs\refs\remotes\origin\master

     文件        275  2017-12-03 20:35  beyes\.git\objects\00\ddace8c6d9a62e6af0134df6cfa2c358c56280

     文件        133  2017-12-03 20:35  beyes\.git\objects\0b\05545123e9fe68fdd46f3bf1040905d80a2379

     文件        227  2017-12-03 20:35  beyes\.git\objects\0b\8282cc60cbc9cca9efd66f1216e33f4ed1b255

     文件        484  2017-12-03 20:35  beyes\.git\objects\0c\b7e5cd9430ecfaa795ebae52a7861bedd59b7f

     文件        146  2017-12-03 20:35  beyes\.git\objects\13\936b1904437986f04ec9f7bcadeb9fcb1ed47d

     文件         89  2017-12-03 20:35  beyes\.git\objects\19\80ca62e3b3d958c8763d25b7cdd537873bfa15

     文件        110  2017-12-03 20:35  beyes\.git\objects\1d\916ab47c9378a167a18904feaf412d1eaa7093

     文件        199  2017-12-03 20:35  beyes\.git\objects\22\6e8cc1f7bbfbdbc59695f4f7027ac41196f716

     文件        200  2017-12-03 20:35  beyes\.git\objects\22\fe388cfced1ddc5889cdd58d2092447de58cd4

     文件        188  2017-12-03 20:35  beyes\.git\objects\23\af9ba24a111c8d3cd89d02bc4a12bb0042c39c

     文件        200  2017-12-03 20:35  beyes\.git\objects\25\79c6d388d3e30cb73566d8a1b238d0667b3ce9

............此处省略149个文件信息

评论

共有 条评论