资源简介

决策树的代码实现,参考《机器学习实战》;数据集采用 adult 数据集,并增加了数据清洗步骤。该决策树是随机实现的,同时增加了针对过拟合的剪枝。

资源截图

代码片段和文件信息

import numpy as np
import pandas as pd
import csv
import treePlotter
import matplotlib.pyplot as plt


# discretization function
def discretization(filename):
    """Read a CSV file and bin its continuous columns into integer labels.

    Each column listed below is passed to ``pd.cut`` with an integer bin
    count ``k`` and relabelled with ``range(k)``, so its values become
    ``0 .. k-1``. All other columns are returned as read.

    Args:
        filename: Path of the CSV file to load; it must contain every
            column named in the bin table below.

    Returns:
        The loaded DataFrame with the listed columns replaced by their
        binned (categorical) versions.

    Raises:
        KeyError: If one of the listed columns is missing from the file.
    """
    # Column name -> number of bins requested from pd.cut.
    bin_counts = {
        'age': 4,
        'fnlwgt': 10,
        'education-num': 16,
        'capital-gain': 3,
        'capital-loss': 3,
        'hours-per-week': 3,
    }
    data = pd.read_csv(filename)
    for column, k in bin_counts.items():
        data[column] = pd.cut(data[column], k, labels=range(k))
    return data

def _map_cells(frame, func):
    """Return a new DataFrame with *func* applied to every cell of *frame*."""
    # Column-wise Series.map is used instead of DataFrame.applymap, which
    # newer pandas releases deprecate.
    return frame.apply(lambda column: column.map(func))


def _strip_whitespace(cell):
    """Strip surrounding whitespace from string cells; pass others through."""
    return cell.strip() if isinstance(cell, str) else cell


def _strip_label_period(cell):
    """Remove the trailing period from the '<=50K.' / '>50K.' labels."""
    return cell.strip('.') if cell == '<=50K.' or cell == '>50K.' else cell


def _drop_unknown_rows(frame):
    """Return *frame* without the rows holding a '?' or a missing cell."""
    # Masking turns every '?' cell into NaN so that dropna() removes its row.
    return frame[~frame.isin(['?'])].dropna()


if __name__ == "__main__":
    # Data preprocessing: clean both raw files, then discretize them together.
    names = ("age workclass fnlwgt education education-num "
             "marital-status occupation relationship race sex "
             "capital-gain capital-loss hours-per-week "
             "native-country income").split(' ')

    # --- adult.data: strip whitespace, then drop rows with unknown values ---
    data = pd.read_csv('./adult.data', names=names)
    data = _map_cells(data, _strip_whitespace)
    data.to_csv("adult_data_all.csv", sep=',', index=False)
    # Re-read the file just written so pandas infers the column dtypes again.
    data = pd.read_csv("adult_data_all.csv")
    data = _drop_unknown_rows(data)
    data.to_csv("clean_adult_data_all.csv", sep=',', index=False)

    # --- adult.test: same cleaning, plus label and first-row fixes ---
    test = pd.read_csv('./adult.test', names=names)
    test = _map_cells(test, _strip_whitespace)
    test = _map_cells(test, _strip_label_period)
    # The first row is discarded; presumably it is a non-data line in
    # adult.test -- TODO confirm against the raw file.
    test.drop([0], inplace=True)
    test.to_csv("adult_test_all.csv", sep=',', index=False)
    test = pd.read_csv("adult_test_all.csv")
    test = _drop_unknown_rows(test)
    test.to_csv("clean_adult_test_all.csv", sep=',', index=False)

    # --- concatenate data and test so both are binned in a single pass ---
    data = pd.read_csv("clean_adult_data_all.csv")
    test = pd.read_csv("clean_adult_test_all.csv")
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    data_test = pd.concat([data, test])
    data_test.to_csv("adult_data_test_all.csv", sep=',', index=False)

    # --- discretization, then split back into the two original parts ---
    data_test = discretization("adult_data_test_all.csv")
    train_rows = len(data)
    data = data_test[:train_rows]
    test = data_test[train_rows:]
    data.to_csv("discretization_adult_data_all.csv", sep=',', index=False)
    test.to_csv("discretization_adult_test_all.csv", sep=',', index=False)


 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----

     文件        473  2018-06-16 18:01  decisionTree\.idea\decisionTree.iml

     文件        227  2018-06-16 18:01  decisionTree\.idea\misc.xml

     文件        276  2018-06-16 00:15  decisionTree\.idea\modules.xml

     文件      34744  2018-06-23 20:35  decisionTree\.idea\workspace.xml

     文件    3974305  1996-08-09 12:00  decisionTree\adult.data

     文件       5229  2001-01-30 11:00  decisionTree\adult.names

     文件    2003153  1996-08-09 12:00  decisionTree\adult.test

     文件    1011100  2018-06-20 01:13  decisionTree\adultTree.txt

     文件    3551168  2018-06-20 01:12  decisionTree\adult_data_all.csv

     文件    4837202  2018-06-20 01:12  decisionTree\adult_data_test_all.csv

     文件    1938164  2018-06-20 01:12  decisionTree\adult_test_all.csv

     文件    3045388  2018-06-20 01:12  decisionTree\clean_adult_data_all.csv

     文件    5441989  2018-06-17 18:06  decisionTree\clean_adult_data_test_all.csv

     文件    1610978  2018-06-20 01:12  decisionTree\clean_adult_test_all.csv

     文件       2937  2018-06-20 00:24  decisionTree\dataAnalysis.py

     文件       6172  2018-06-15 23:52  decisionTree\dataAnalysis_bak.py

     文件       3069  2018-06-20 01:10  decisionTree\dataAnalysis_delete.py

     文件      12570  2018-06-17 19:08  decisionTree\data_cleaning.py

     文件       9682  2018-06-17 15:05  decisionTree\data_cleaning.pyc

     文件      14587  2018-06-17 15:45  decisionTree\data_cleaning_bak.py

     文件       5697  2018-06-17 18:05  decisionTree\data_discretization.py

     文件       5678  2018-06-17 16:18  decisionTree\data_discretization_bak.py

     文件      17617  2018-06-17 22:10  decisionTree\decision tree.docx

     文件        396  2018-06-17 16:48  decisionTree\decisionTree.py

     文件        632  2018-06-17 16:48  decisionTree\decisionTreeAnalysis.py

     文件        841  2018-06-13 20:10  decisionTree\discretization.py

     文件          0  2018-06-17 15:22  decisionTree\discretizationAnalysis.py

     文件    2974140  2018-06-20 01:12  decisionTree\discretization_adult_data_all.csv

     文件    1485226  2018-06-20 01:12  decisionTree\discretization_adult_test_all.csv

     文件       4842  2018-06-17 17:12  decisionTree\no_pruning_trees.py

............此处省略14个文件信息

评论

共有 条评论