打开网站链接 http://archive.ics.uci.edu/ml/，点击链接“view all data sets”，打开所有数据集页面，点击 Instances，按照实例数量由多到少排序，选择任务为 Classification 的数据集。最后我们小组选择了“Letter Recognition Data Set”（字母识别数据集）。
from numpy import *
import string
# Parse a data file; every data value is an integer.
def loadDataSet(filename):
# NOTE(review): this listing has lost its indentation, its commas and its
# straight quotes, and some lines are missing, so it is not runnable as shown.
# The number of fields is taken from the first line of the file.
numFeat = len(open(filename).readline().split(‘‘))
dataMat = []
fr = open(filename)
for line in fr.readlines():
lineArr= []
# presumably the separator passed to split() was ',' before extraction -- confirm against the original file
curLine = line.strip(‘\n‘).split(‘‘)
# The index starts at 1, so column 0 is skipped here -- presumably it holds the class label; verify.
for i in range(1 numFeat):
# NOTE(review): the loop body and the code that fills dataMat and defines
# labelMat are missing from this excerpt; recover them from the original file.
return dataMat labelMat
# purpose: classify the data by comparing one feature against a threshold
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every sample by comparing one feature against a threshold.

    Args:
        dataMatrix: 2-D samples-by-features data (nested list, ndarray or
            numpy matrix).
        dimen: Column index of the feature to test.
        threshVal: Threshold the feature value is compared against.
        threshIneq: ``'lt'`` labels samples whose feature is ``<= threshVal``
            as -1.0; any other value labels samples whose feature is
            ``> threshVal`` as -1.0.

    Returns:
        An ``(m, 1)`` ndarray holding +1.0 or -1.0 for each of the m samples.
    """
    # NOTE: the comparison operators were lost in the extracted listing and
    # are restored here from the standard decision-stump formulation.
    # asarray() gives a 1-D feature column for lists, arrays and matrices.
    featureColumn = asarray(dataMatrix)[:, dimen]
    retArray = ones((shape(dataMatrix)[0], 1))
    if threshIneq == 'lt':
        retArray[featureColumn <= threshVal] = -1.0
    else:
        retArray[featureColumn > threshVal] = -1.0
    return retArray
# purpose: build a single-level decision tree (decision stump), the weak classifier
# input: dataArr: data set; classLabels: class labels; D: data weights
# output: bestStump: the single-level decision tree with the minimum error rate; minError: the minimum error rate;
# bestClasEst: estimated class labels
def buildStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest weighted error.

    Every feature is scanned over evenly spaced thresholds, and both
    inequality directions are tried at each threshold.

    Args:
        dataArr: Samples-by-features data, convertible with ``mat()``.
        classLabels: One class label per sample (a flat sequence); it is
            compared element-wise with the +1.0 / -1.0 stump predictions.
        D: ``(m, 1)`` column of per-sample weights; the weighted error is
            computed as ``D.T * errArr``.

    Returns:
        A tuple ``(bestStump, minError, bestClasEst)``:
        ``bestStump`` is a dict with keys ``'dim'``, ``'thresh'`` and
        ``'ineq'``; ``minError`` is the lowest weighted error found (a 1x1
        matrix, as produced by the matrix product); ``bestClasEst`` holds
        the ``(m, 1)`` predictions of that stump.
    """
    dataMatrix = mat(dataArr)
    labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0
    # Dictionary that stores the parameters of the best stump found so far.
    bestStump = {}
    bestClasEst = mat(zeros((m, 1)))
    minError = inf  # start at +infinity so the first candidate always wins
    for i in range(n):  # loop over all feature dimensions
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # j runs from -1 to numSteps, so thresholds go from one step below
        # the minimum up to the maximum of the feature.
        for j in range(-1, int(numSteps) + 1):
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt', 'gt']:  # try both inequality directions
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 marks a misclassified sample, 0 a correct one.
                errArr = mat(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr  # total error weighted by D
                if weightedError < minError:
                    # Better than anything seen so far: remember this stump.
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst
# purpose: the complete AdaBoost algorithm
# input parameters:
# dataArr: data set
# classLabels: class labels
# numIt: number of iterations (the only parameter the user needs to specify)
# output parameter:
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 356180 2016-11-24 20:38 traindata.txt
文件 7150 2016-11-26 22:02 TreeAdaBoost.py
文件 36042 2017-03-18 09:31 文档.docx
文件 356383 2016-11-24 20:39 testdata.txt
----------- --------- ---------- ----- ----
755755 4
