资源简介
决策树算法(ID3和C4.5),两个算法分开写的,包含有数据集。
代码片段和文件信息
#-*-coding:utf-8-*-
from numpy import *
import math
import copy
import cPickle as pickle
class C45DTree(object):
    """C4.5 decision tree.

    Trains a nested-dict tree from a tab-separated dataset whose last
    column is the class label. The tree has the shape
    ``{feature_label: {feature_value: subtree_or_class_label}}``.

    NOTE(review): the original paste was whitespace/comma-mangled; this
    reconstruction restores the obvious syntax (commas, quotes, indents).
    ``computeEntropy``/``splitDataSet``/``computeSplitInfo`` are defined
    in a part of the file truncated from this view.
    """

    def __init__(self):
        self.tree = {}      # learned tree (nested dicts), filled by train()
        self.dataSet = []   # list of records; record[-1] is the class label
        self.labels = []    # feature names aligned with record columns

    # 数据导入函数 — data loading
    def loadDataSet(self, path, labels):
        """Load a tab-separated file into self.dataSet and store feature labels.

        path: file path; every non-blank line is one record, split on '\t'.
        labels: feature names for the non-class columns.
        """
        with open(path, "rb") as fp:
            content = fp.read()
        # Decode so rows are text on both Python 2 and 3; the original
        # split raw bytes, which breaks under Python 3.
        rowlist = content.decode("utf-8").splitlines()
        self.dataSet = [row.split("\t") for row in rowlist if row.strip()]
        self.labels = labels

    # 执行决策树函数 — train entry point
    def train(self):
        """Build self.tree from the loaded dataset.

        Deep-copies the labels because buildTree deletes entries from the
        list it is given.
        """
        labels = copy.deepcopy(self.labels)
        self.tree = self.buildTree(self.dataSet, labels)

    # 构建决策树 — recursive tree construction
    def buildTree(self, dataSet, labels):
        """Recursively build the decision tree for dataSet.

        Returns either a class label (leaf) or a nested dict keyed by the
        best feature's label, then by each of that feature's values.
        Mutates ``labels`` (removes the chosen feature's name).
        """
        cateList = [data[-1] for data in dataSet]
        # All records share one class: this branch is a pure leaf.
        if cateList.count(cateList[0]) == len(cateList):
            return cateList[0]
        # Only the class column remains: fall back to majority vote.
        if len(dataSet[0]) == 1:
            return self.maxCate(cateList)
        # Core of the algorithm: choose the split feature by gain ratio.
        bestFeat, featValueList = self.getBestFeat(dataSet)
        bestFeatLabel = labels[bestFeat]
        tree = {bestFeatLabel: {}}
        del labels[bestFeat]
        for value in featValueList:
            subLabels = labels[:]
            # Partition on the best feature's value and recurse.
            subset = self.splitDataSet(dataSet, bestFeat, value)
            tree[bestFeatLabel][value] = self.buildTree(subset, subLabels)
        return tree

    # 计算出现次数最多的类别标签 — majority class label
    def maxCate(self, catelist):
        """Return the most frequent class label in catelist.

        Fix: the original built a {count: label} dict, so labels sharing a
        count overwrote each other and the result was arbitrary; max with a
        count key considers every label.
        """
        return max(catelist, key=catelist.count)

    # 计算最优特征 — best feature by C4.5 gain ratio
    def getBestFeat(self, dataSet):
        """Return (index, value_list) of the feature with the highest
        information-gain ratio."""
        numFeats = len(dataSet[0]) - 1
        totality = len(dataSet)
        baseEntropy = self.computeEntropy(dataSet)
        conditionEntropy = []
        splitInfo = []
        allFeatVList = []
        # range (not xrange) so this runs on Python 2 and 3 alike.
        for f in range(numFeats):
            featList = [example[f] for example in dataSet]
            splitI, featureValueList = self.computeSplitInfo(featList)
            allFeatVList.append(featureValueList)
            splitInfo.append(splitI)
            resultGain = 0.0
            for value in featureValueList:
                subSet = self.splitDataSet(dataSet, f, value)
                appearNum = float(len(subSet))
                subEntropy = self.computeEntropy(subSet)
                resultGain += (appearNum / totality) * subEntropy
            conditionEntropy.append(resultGain)
        infoGainArray = baseEntropy * ones(numFeats) - array(conditionEntropy)
        # NOTE(review): a feature with a single value has split info 0, so
        # this division yields inf/nan for it — guard upstream if such
        # features can occur in the data.
        infoGainRatio = infoGainArray / array(splitInfo)
        # argmax == argsort(-x)[0]: index of the first maximum.
        bestFeatureIndex = argmax(infoGainRatio)
        return bestFeatureIndex, allFeatVList[bestFeatureIndex]
#计算划分信息
def computeSplitInfo(selffeatureVList):
numEntries=len(featureVList)
featureVauleSetList=list(set(featureVList))
valueCounts=[featureVList.count(featVec) for featVec in featureVauleSetList]
pList=[float(item)/n
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 398 2018-04-22 20:45 决策树(ID3和C45)\C45DTree\.idea\C4.5DTree.iml
文件 213 2018-04-22 20:45 决策树(ID3和C45)\C45DTree\.idea\misc.xml
文件 782 2018-04-23 09:02 决策树(ID3和C45)\C45DTree\.idea\modules.xml
文件 30492 2018-04-23 10:05 决策树(ID3和C45)\C45DTree\.idea\workspace.xml
文件 5446 2018-04-23 09:21 决策树(ID3和C45)\C45DTree\C45DTree.py
文件 4963 2018-04-23 09:21 决策树(ID3和C45)\C45DTree\C45DTree.pyc
文件 807 2018-04-25 18:12 决策树(ID3和C45)\C45DTree\C45Test.py
文件 303 2018-04-22 21:22 决策树(ID3和C45)\C45DTree\C45Train.py
文件 195 2018-04-25 18:19 决策树(ID3和C45)\C45DTree\data.tree
文件 5066 2018-04-25 18:05 决策树(ID3和C45)\C45DTree\dataset.dat
文件 54 2018-04-23 09:26 决策树(ID3和C45)\C45DTree\test_data.dat
文件 455 2018-04-25 18:05 决策树(ID3和C45)\ID3DTree\.idea\ID3DTree.iml
文件 213 2018-04-22 19:58 决策树(ID3和C45)\ID3DTree\.idea\misc.xml
文件 405 2018-04-25 18:05 决策树(ID3和C45)\ID3DTree\.idea\modules.xml
文件 26132 2018-04-25 19:31 决策树(ID3和C45)\ID3DTree\.idea\workspace.xml
文件 195 2018-04-25 18:16 决策树(ID3和C45)\ID3DTree\data.tree
文件 5066 2018-04-25 18:04 决策树(ID3和C45)\ID3DTree\dataset.dat
文件 3887 2018-04-22 20:43 决策树(ID3和C45)\ID3DTree\ID3DTree.py
文件 4232 2018-04-22 20:43 决策树(ID3和C45)\ID3DTree\ID3DTree.pyc
文件 806 2018-04-25 18:16 决策树(ID3和C45)\ID3DTree\ID3Test.py
文件 304 2018-04-25 17:59 决策树(ID3和C45)\ID3DTree\ID3Train.py
目录 0 2018-05-01 21:36 决策树(ID3和C45)\C45DTree\.idea
目录 0 2018-05-01 21:36 决策树(ID3和C45)\ID3DTree\.idea
目录 0 2018-05-01 21:36 决策树(ID3和C45)\C45DTree
目录 0 2018-05-01 21:36 决策树(ID3和C45)\ID3DTree
目录 0 2018-05-01 21:36 决策树(ID3和C45)
----------- --------- ---------- ----- ----
90414 26
评论
共有 条评论