资源简介
朴素贝叶斯文本分类python实现(含数据集)
代码片段和文件信息
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 30 21:39:21 2017
@author: Q
"""
import numpy as np
import re
import feedparser
import operator
def loadDataSet():
    """Return a small hard-coded corpus of tokenised posts with labels.

    Returns:
        tuple: ``(postingList, classVec)``. ``postingList`` is a list of six
        token lists; ``classVec`` is the parallel list of labels, where
        1 marks an abusive post and 0 a non-abusive one.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec
def createVocabList(data):
    """Build the vocabulary: every distinct token found in any document.

    Args:
        data: iterable of documents, each an iterable of hashable tokens.

    Returns:
        list: the distinct tokens. The order comes from set iteration and is
        therefore not guaranteed to be stable between runs.
    """
    vocabulary = set()
    for subdata in data:
        # Grow the set in place rather than building a new union each time.
        vocabulary.update(subdata)
    return list(vocabulary)
def setofWords2Vec(vocabList, data):
    """Convert a tokenised document into a count vector over the vocabulary.

    Despite the "set of words" name, each occurrence increments the count,
    so the result is a bag-of-words vector.

    Args:
        vocabList: sequence of vocabulary tokens (hashable); position i of
            the result corresponds to ``vocabList[i]``.
        data: iterable of tokens making up one document.

    Returns:
        list[int]: one count per vocabulary entry. Tokens that are not in
        the vocabulary are ignored.
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary with `in` and `.index()` for every token.
    positionOf = {}
    for position, word in enumerate(vocabList):
        positionOf.setdefault(word, position)

    returnList = [0] * len(vocabList)
    for vocab in data:
        position = positionOf.get(vocab)
        if position is not None:
            returnList[position] += 1
    return returnList
def trainNB0(trainMatrix, trainCategory):
    """Train a two-class naive Bayes model and return log-probabilities.

    Args:
        trainMatrix: sequence of equal-length count vectors, one per
            training document.
        trainCategory: sequence of labels parallel to ``trainMatrix``;
            a label equal to 1 selects class 1, anything else class 0.

    Returns:
        tuple: ``(p0Vect, p1Vect, pAbusive)`` where ``p0Vect`` and ``p1Vect``
        are arrays of per-word log conditional probabilities for class 0
        and class 1, and ``pAbusive`` is ``sum(trainCategory)`` divided by
        the number of training documents.
    """
    pAbusive = sum(trainCategory) / float(len(trainCategory))

    numWords = len(trainMatrix[0])
    # Counts start at 1 and denominators at 2 so that a word unseen in one
    # class never yields probability 0 (and therefore log(0)).
    p1num = np.ones(numWords)
    p0num = np.ones(numWords)
    p1Denom = 2.0
    p0Denom = 2.0

    for i, label in enumerate(trainCategory):
        docVec = trainMatrix[i]
        if label == 1:
            p1num = p1num + docVec
            p1Denom = p1Denom + sum(docVec)
        else:
            p0num = p0num + docVec
            p0Denom = p0Denom + sum(docVec)

    # Work in log space so the classifier can add instead of multiplying
    # many small probabilities.
    p1Vect = np.log(p1num / p1Denom)
    p0Vect = np.log(p0num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one count vector using naive Bayes log-probabilities.

    Args:
        vec2Classify: count vector of the document to classify.
        p0Vec: per-word log conditional probabilities for class 0.
        p1Vec: per-word log conditional probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        int: 1 if the class-1 log score is strictly greater than the
        class-0 log score, otherwise 0 (ties go to class 0).
    """
    # Coerce to an array so a plain Python list multiplies element-wise.
    features = np.asarray(vec2Classify)
    p0 = np.sum(features * p0Vec) + np.log(1 - pClass1)
    p1 = np.sum(features * p1Vec) + np.log(pClass1)
    if p1 > p0:
        return 1
    return 0
def textParse(bigString):
    """Split raw text into lowercase tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        list[str]: the pieces of ``bigString`` between runs of non-word
        characters, lowercased, keeping only those longer than two
        characters.
    """
    splitdata = re.split(r'\W+', bigString)
    # The length filter also discards the empty strings that re.split
    # yields at the start or end of the text.
    return [token.lower() for token in splitdata if len(token) > 2]
def spamTest():
docList = []
classList = []
for i in range(126):
with open(‘spam/%d.txt‘%i) as f:
doc = f.read()
docList.append(doc)
classList.append(1)
with open(‘ham/%d.txt‘%i) as f:
doc = f.read()
docList.append(doc)
classList.append(0)
vocalList = createVocabList(docList)
trainList = list(range(50))
testList = []
for i in range(13):
num = int(np.random.uniform(0len(docList))-10)
testList.append(trainList[num])
del(trainList[num])
docMatrix = []
docClass = []
for i in trainList:
subVec = setofWords2Vec(vocalListdocList[i])
docMatrix.append(subVec)
docClass.append(classList[i])
p0vp1vpAb = trainNB0(docMatrixdocClass
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 6 2017-12-03 20:35 beyes\.git\COMMIT_EDITMSG
文件 297 2017-12-03 20:36 beyes\.git\config
文件 73 2017-12-03 20:35 beyes\.git\description
文件 23 2017-12-03 20:35 beyes\.git\HEAD
文件 478 2017-12-03 20:35 beyes\.git\hooks\applypatch-msg.sample
文件 896 2017-12-03 20:35 beyes\.git\hooks\commit-msg.sample
文件 189 2017-12-03 20:35 beyes\.git\hooks\post-update.sample
文件 424 2017-12-03 20:35 beyes\.git\hooks\pre-applypatch.sample
文件 1642 2017-12-03 20:35 beyes\.git\hooks\pre-commit.sample
文件 1348 2017-12-03 20:35 beyes\.git\hooks\pre-push.sample
文件 4898 2017-12-03 20:35 beyes\.git\hooks\pre-rebase.sample
文件 544 2017-12-03 20:35 beyes\.git\hooks\pre-receive.sample
文件 1239 2017-12-03 20:35 beyes\.git\hooks\prepare-commit-msg.sample
文件 3610 2017-12-03 20:35 beyes\.git\hooks\update.sample
文件 4125 2017-12-03 20:35 beyes\.git\index
文件 240 2017-12-03 20:35 beyes\.git\info\exclude
文件 152 2017-12-03 20:35 beyes\.git\logs\HEAD
文件 152 2017-12-03 20:35 beyes\.git\logs\refs\heads\master
文件 143 2017-12-03 20:36 beyes\.git\logs\refs\remotes\origin\master
文件 275 2017-12-03 20:35 beyes\.git\ob
文件 133 2017-12-03 20:35 beyes\.git\ob
文件 227 2017-12-03 20:35 beyes\.git\ob
文件 484 2017-12-03 20:35 beyes\.git\ob
文件 146 2017-12-03 20:35 beyes\.git\ob
文件 89 2017-12-03 20:35 beyes\.git\ob
文件 110 2017-12-03 20:35 beyes\.git\ob
文件 199 2017-12-03 20:35 beyes\.git\ob
文件 200 2017-12-03 20:35 beyes\.git\ob
文件 188 2017-12-03 20:35 beyes\.git\ob
文件 200 2017-12-03 20:35 beyes\.git\ob
............此处省略149个文件信息
评论
共有 条评论