资源简介

泰坦尼克号(Titanic)Python 数据分析,附带数据集和源代码,强烈推荐。

资源截图

代码片段和文件信息

import re
import numpy as np
import pandas as pd
import random as rd
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

# Console display settings for NumPy arrays and pandas frames: print everything
# (no row/column truncation), 160-character lines, 4 digits of precision.
np.set_printoptions(precision=4, threshold=10000, linewidth=160, edgeitems=999, suppress=True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 160)
# Fully-qualified option names: abbreviated keys such as 'precision' can match
# more than one registered option in recent pandas and raise OptionError.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 4)
    

def processCabin():
    """Derive cabin features on the module-level DataFrame ``df``.

    Reads the ``Cabin`` column and adds:

    * ``CabinLetter`` -- integer code (via ``pd.factorize``) of the first run
      of letters in the cabin string,
    * ``CabinLetter_<code>`` dummy columns when the global ``keep_binary`` is
      truthy,
    * ``CabinNumber`` -- first run of digits in the cabin string, plus one,
    * ``CabinNumber_scaled`` -- standardized ``CabinNumber`` when the global
      ``keep_scaled`` is truthy.

    Missing cabins are filled with the placeholder ``'U0'`` first. The global
    ``df`` is modified in place and rebound when dummy columns are appended.
    """
    global df
    # Use .loc rather than chained indexing so the fill is guaranteed to write
    # into df itself and not into a temporary copy.
    df.loc[df['Cabin'].isnull(), 'Cabin'] = 'U0'
    df['CabinLetter'] = df['Cabin'].map(getCabinLetter)
    df['CabinLetter'] = pd.factorize(df['CabinLetter'])[0]

    if keep_binary:
        cletters = pd.get_dummies(df['CabinLetter']).rename(columns=lambda x: 'CabinLetter_' + str(x))
        df = pd.concat([df, cletters], axis=1)

    # getCabinNumber yields a digit string or 0; astype(int) normalizes both.
    df['CabinNumber'] = df['Cabin'].map(getCabinNumber).astype(int) + 1
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        # StandardScaler expects 2-D input; select a one-column frame and
        # flatten the (n, 1) result back to a single column.
        df['CabinNumber_scaled'] = scaler.fit_transform(df[['CabinNumber']]).ravel()


def getCabinLetter(cabin):
    """Return the first run of ASCII letters in *cabin*, or ``'U'`` if none.

    >>> getCabinLetter('C85')
    'C'
    >>> getCabinLetter('123')
    'U'
    """
    match = re.search(r"([a-zA-Z]+)", cabin)
    if match:
        return match.group()
    return 'U'


def getCabinNumber(cabin):
    """Return the first run of digits in *cabin*, or ``0`` if there is none.

    NOTE: a match is returned as a *string* of digits while the fallback is
    the integer ``0``; callers are expected to cast the result (for example
    with ``.astype(int)``).

    >>> getCabinNumber('C85')
    '85'
    >>> getCabinNumber('U')
    0
    """
    match = re.search(r"([0-9]+)", cabin)
    if match:
        return match.group()
    return 0


def processTicket():
    """Derive ticket features on the module-level DataFrame ``df``.

    Reads the ``Ticket`` column and adds:

    * ``TicketPrefixId`` -- integer code (via ``pd.factorize``) of the
      upper-cased ticket prefix with ``.``, ``?`` and ``/`` removed and
      ``STON`` rewritten to ``SOTON``,
    * ``TicketPrefix_<prefix>`` dummy columns when the global ``keep_binary``
      is truthy,
    * ``TicketNumber`` -- trailing digits of the ticket as an integer,
    * ``TicketNumberDigits`` -- how many digits that number has,
    * ``TicketNumberStart`` -- its leading digit,
    * ``TicketNumber_scaled`` -- standardized ``TicketNumber`` when the global
      ``keep_scaled`` is truthy.

    The intermediate ``TicketPrefix`` column is dropped before returning. The
    global ``df`` is modified in place and rebound when dummy columns are
    appended.
    """
    global df

    df['TicketPrefix'] = df['Ticket'].map(lambda x: getTicketPrefix(x.upper()))
    df['TicketPrefix'] = df['TicketPrefix'].map(lambda x: re.sub(r'[\.?\/?]', '', x))
    df['TicketPrefix'] = df['TicketPrefix'].map(lambda x: re.sub('STON', 'SOTON', x))
    df['TicketPrefixId'] = pd.factorize(df['TicketPrefix'])[0]

    if keep_binary:
        prefixes = pd.get_dummies(df['TicketPrefix']).rename(columns=lambda x: 'TicketPrefix_' + str(x))
        df = pd.concat([df, prefixes], axis=1)

    df.drop(['TicketPrefix'], axis=1, inplace=True)

    # getTicketNumber always returns a non-empty digit string ('0' when the
    # ticket has no trailing number), so len() and x[0:1] are safe to cast.
    df['TicketNumber'] = df['Ticket'].map(getTicketNumber)
    # Builtin int replaces the np.int alias, which newer NumPy no longer has.
    df['TicketNumberDigits'] = df['TicketNumber'].map(len).astype(int)
    df['TicketNumberStart'] = df['TicketNumber'].map(lambda x: x[0:1]).astype(int)

    df['TicketNumber'] = df['TicketNumber'].astype(int)

    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        # StandardScaler expects 2-D input; select a one-column frame and
        # flatten the (n, 1) result back to a single column.
        df['TicketNumber_scaled'] = scaler.fit_transform(df[['TicketNumber']]).ravel()


def getTicketPrefix(ticket):
    """Return the first run of letters, dots and slashes in *ticket*.

    Falls back to ``'U'`` when the ticket contains none of those characters.

    >>> getTicketPrefix('PC 17599')
    'PC'
    >>> getTicketPrefix('113803')
    'U'
    """
    match = re.search(r"([a-zA-Z\.\/]+)", ticket)
    if match:
        return match.group()
    return 'U'

def getTicketNumber(ticket):
    """Return the run of digits that ends *ticket*, or ``'0'`` if none.

    The result is always a string so callers can take its length or its first
    character before casting to an integer.

    >>> getTicketNumber('A/5 21171')
    '21171'
    >>> getTicketNumber('LINE')
    '0'
    """
    match = re.search(r"([\d]+$)", ticket)
    if match:
        return match.group()
    return '0'


def processFare():
    global df           
    df[‘Fare‘][ np.isnan(df[‘Fare‘]) ] = df[‘Fare‘].median()
    df[‘Fare‘][ np.where(df[‘Fare‘]==0)[0] ] = df[‘Fare‘][ df[‘Fare‘].nonzero()[0] ]

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2015-03-23 08:38  kaggle-Titanic-master\
     文件         200  2015-03-23 08:38  kaggle-Titanic-master\README.md
     文件       14597  2015-03-23 08:38  kaggle-Titanic-master\dataProcess.py
     文件       72565  2015-03-23 08:38  kaggle-Titanic-master\figure_1.png
     文件        4216  2015-03-23 08:38  kaggle-Titanic-master\randomForest.py
     文件        2839  2015-03-23 08:38  kaggle-Titanic-master\result.csv
     文件       28210  2015-03-23 08:38  kaggle-Titanic-master\test.csv
     文件       60302  2015-03-23 08:38  kaggle-Titanic-master\train.csv

评论

共有 条评论