• 大小: 8KB
    文件类型: .rar
    金币: 1
    下载: 0 次
    发布日期: 2021-06-02
  • 语言: 其他
  • 标签: python  ID3  

资源简介

运用ID3算法训练决策树,运行成功,包含代码和训练测试数据集。

资源截图

代码片段和文件信息

#-*-coding:utf-8-*-
from numpy import *
import math
import copy
import cPickle as pickle

class ID3DTree(object):

    def __init__(self):
        self.tree={}
        self.dataSet=[]
        self.labels=[]
    #数据导入函数
    def loadDataSet(selfpathlabels):
        recordlist=[]
        fp=open(path“rb“)
        content=fp.read()
        fp.close()
        rowlist=content.splitlines()
        recordlist=[row.split(“\t“) for row in rowlist if row.strip()]
        self.dataSet=recordlist
        self.labels=labels
    #执行决策树函数
    def train(self):
            labels=copy.deepcopy(self.labels)
            self.tree=self.buildTree(self.dataSetlabels)

    #构建决策树
    def buildTree(selfdataSetlabels):
        cateList=[data[-1] for data in dataSet]
        if cateList.count(cateList[0])==len(cateList):
            return cateList[0]
        if len(dataSet[0])==1:
            return self.maxCate(cateList)
        #s算法核心
        bestFeat=self.getBestFeat(dataSet)
        bestFeatLabel=labels[bestFeat]
        tree={bestFeatLabel:{}}
        del(labels[bestFeat])
        #抽取最优特征轴的列向量
        uniqueVals=set([data[bestFeat] for data in dataSet])#去重
        for value in uniqueVals:
            sublabels=labels[:]
            #按最优特征列和值分割数据集
            splitDataset=self.splitDataSet(dataSetbestFeatvalue)
            subTree=self.buildTree(splitDatasetsublabels)
            tree[bestFeatLabel][value]=subTree
        return tree
    #计算出现次数最多的类别标签
    def maxCate(selfcatelist):
        items=dict([(catelist.count(i)i) for i in catelist])
        return items[max(items.keys())]
    #计算最优特征
    def getBestFeat(selfdataSet):
        numFeatures=len(dataSet[0])-1
        baseEntropy=self.computeEntropy(dataSet)
        bestInfoGain=0.0
        bestFeature=-1
        for i in xrange(numFeatures):
            uniqueVals=set([data[i] for data in dataSet])
            newEntropy=0.0
            for value in uniqueVals:
                subDataSet=self.splitDataSet(dataSetivalue)
                prob=len(subDataSet)/float(len(dataSet))
                newEntropy+=prob*self.computeEntropy(subDataSet)
            infoGain=baseEntropy-newEntropy
            if(infoGain>bestInfoGain):
                bestInfoGain=infoGain
                bestFeature=i
        return bestFeature
    #计算信息熵
    def computeEntropy(selfdataSet):
        datalen=float(len(dataSet))
        cateList=[data[-1] for data in dataSet]
        items=dict([(icateList.count(i)) for i in cateList])
        infoEntropy=0.0
        for key in items:
            prob=float(items[key])/datalen
            infoEntropy-=prob*math.log(prob2)
        return infoEntropy
    #划分数据集
    def splitDataSet(selfdataSetaxisvalue):
        rtnList=[]
        for featVec in dataSet:
            if featVec[axis]==value:
                rFeatVec=featVec[:axis]
                rFeatVec.extend(featVec[axis+1:])
             

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----

     文件        455  2018-04-25 18:05  ID3DTree\.idea\ID3DTree.iml

     文件        213  2018-04-22 19:58  ID3DTree\.idea\misc.xml

     文件        405  2018-04-25 18:05  ID3DTree\.idea\modules.xml

     文件      26132  2018-04-25 19:31  ID3DTree\.idea\workspace.xml

     文件        195  2018-04-25 18:16  ID3DTree\data.tree

     文件       5066  2018-04-25 18:04  ID3DTree\dataset.dat

     文件       3887  2018-04-22 20:43  ID3DTree\ID3DTree.py

     文件       4232  2018-04-22 20:43  ID3DTree\ID3DTree.pyc

     文件        806  2018-04-25 18:16  ID3DTree\ID3Test.py

     文件        304  2018-04-25 17:59  ID3DTree\ID3Train.py

     目录          0  2018-04-27 18:57  ID3DTree\.idea

     目录          0  2018-04-27 18:57  ID3DTree

----------- ---------  ---------- -----  ----

                41695                    12


评论

共有 条评论