资源简介
经典又兼具趣味性的案例——泰坦尼克号问题源码。大家都熟悉『Jack and Rose』的故事:豪华游轮沉了,大家都惊恐逃生,可是救生艇的数量有限,无法人人都有,副船长发话了『lady and kid first!』,所以是否获救其实并非随机,而是基于一些背景有先后顺序(rank)的。
训练和测试数据是一些乘客的个人信息以及存活状况,要尝试根据它生成合适的模型并预测其他人的存活状况。
对,这是一个二分类问题,很多分类算法都可以解决。
代码片段和文件信息
import re
import numpy as np
import pandas as pd
import random as rd
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
# Console display settings for NumPy arrays and pandas objects.
# Option names are fully qualified: the bare key 'precision' matches more
# than one pandas option on recent releases and raises OptionError.
np.set_printoptions(precision=4, threshold=10000, linewidth=160, edgeitems=999, suppress=True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 160)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 4)
def processCabin():
    """Derive cabin features on the module-level DataFrame ``df``.

    Steps, all applied to the global ``df``:

    * missing ``Cabin`` values are replaced with the placeholder ``'U0'``;
    * ``CabinLetter`` holds the integer code (via ``pd.factorize``) of the
      first run of letters in ``Cabin``;
    * when ``keep_binary`` is truthy, one indicator column per letter code
      is appended, named ``CabinLetter_<code>``;
    * ``CabinNumber`` holds the first run of digits in ``Cabin`` as an int,
      plus 1 (entries without digits therefore become 1);
    * when ``keep_scaled`` is truthy, ``CabinNumber_scaled`` holds the
      ``StandardScaler`` output for ``CabinNumber``.

    ``keep_binary``, ``keep_scaled`` and ``preprocessing`` are expected to
    exist at module level; their definitions are outside this function.
    """
    global df

    # Use .loc rather than chained indexing so the write is guaranteed to
    # land on df itself and not on a temporary copy.
    df.loc[df['Cabin'].isnull(), 'Cabin'] = 'U0'

    df['CabinLetter'] = df['Cabin'].map(getCabinLetter)
    df['CabinLetter'] = pd.factorize(df['CabinLetter'])[0]

    if keep_binary:
        cletters = pd.get_dummies(df['CabinLetter']).rename(
            columns=lambda x: 'CabinLetter_' + str(x))
        df = pd.concat([df, cletters], axis=1)

    df['CabinNumber'] = df['Cabin'].map(getCabinNumber).astype(int) + 1

    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        # scikit-learn transformers require a 2-D (n_samples, n_features)
        # array; reshape the single column and flatten the result back.
        cabin_numbers = df['CabinNumber'].values.reshape(-1, 1)
        df['CabinNumber_scaled'] = scaler.fit_transform(cabin_numbers).ravel()
# Compiled once at import time instead of on every call.
_CABIN_LETTER_RE = re.compile(r"([a-zA-Z]+)")


def getCabinLetter(cabin):
    """Return the first run of ASCII letters found in ``cabin``.

    Args:
        cabin: cabin string such as ``'C85'``.

    Returns:
        The matched letters (e.g. ``'C'``), or ``'U'`` when the string
        contains no letters.
    """
    match = _CABIN_LETTER_RE.search(cabin)
    if match:
        return match.group()
    return 'U'
# Compiled once at import time instead of on every call.
_CABIN_NUMBER_RE = re.compile(r"([0-9]+)")


def getCabinNumber(cabin):
    """Return the first run of digits found in ``cabin``.

    Args:
        cabin: cabin string such as ``'C85'``.

    Returns:
        The matched digits as a string (e.g. ``'85'``), or the integer ``0``
        when the string contains no digits. The mixed str/int return is
        kept as-is for existing callers, which convert the result with
        ``astype(int)``.
    """
    match = _CABIN_NUMBER_RE.search(cabin)
    if match:
        return match.group()
    return 0
def processTicket():
    """Derive ticket features on the module-level DataFrame ``df``.

    Steps, all applied to the global ``df``:

    * the upper-cased ticket prefix is extracted, stripped of ``.``, ``?``
      and ``/`` characters, and ``'STON'`` is rewritten to ``'SOTON'``;
    * ``TicketPrefixId`` holds the integer code (via ``pd.factorize``) of
      that cleaned prefix;
    * when ``keep_binary`` is truthy, one indicator column per prefix is
      appended, named ``TicketPrefix_<prefix>``;
    * the temporary ``TicketPrefix`` column is dropped;
    * ``TicketNumber`` holds the trailing digits of ``Ticket`` as an int,
      ``TicketNumberDigits`` their count and ``TicketNumberStart`` the
      leading digit;
    * when ``keep_scaled`` is truthy, ``TicketNumber_scaled`` holds the
      ``StandardScaler`` output for ``TicketNumber``.

    ``keep_binary``, ``keep_scaled`` and ``preprocessing`` are expected to
    exist at module level; their definitions are outside this function.
    """
    global df

    # Raw strings keep the regex escapes from being read as (invalid)
    # Python string escapes.
    df['TicketPrefix'] = df['Ticket'].map(lambda x: getTicketPrefix(x.upper()))
    df['TicketPrefix'] = df['TicketPrefix'].map(lambda x: re.sub(r'[\.?\/?]', '', x))
    df['TicketPrefix'] = df['TicketPrefix'].map(lambda x: re.sub(r'STON', 'SOTON', x))

    df['TicketPrefixId'] = pd.factorize(df['TicketPrefix'])[0]

    if keep_binary:
        prefixes = pd.get_dummies(df['TicketPrefix']).rename(
            columns=lambda x: 'TicketPrefix_' + str(x))
        df = pd.concat([df, prefixes], axis=1)

    df.drop(['TicketPrefix'], axis=1, inplace=True)

    # The builtin int replaces the np.int alias, which no longer exists in
    # current NumPy releases.
    df['TicketNumber'] = df['Ticket'].map(getTicketNumber)
    df['TicketNumberDigits'] = df['TicketNumber'].map(len).astype(int)
    df['TicketNumberStart'] = df['TicketNumber'].map(lambda x: x[0:1]).astype(int)
    df['TicketNumber'] = df['TicketNumber'].astype(int)

    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        # scikit-learn transformers require a 2-D (n_samples, n_features)
        # array; reshape the single column and flatten the result back.
        ticket_numbers = df['TicketNumber'].values.reshape(-1, 1)
        df['TicketNumber_scaled'] = scaler.fit_transform(ticket_numbers).ravel()
# Compiled once at import time; the raw string keeps "\." and "\/" as regex
# escapes rather than invalid Python string escapes.
_TICKET_PREFIX_RE = re.compile(r"([a-zA-Z\.\/]+)")


def getTicketPrefix(ticket):
    """Return the first run of letters, dots and slashes in ``ticket``.

    Args:
        ticket: ticket string such as ``'PC 17599'``.

    Returns:
        The matched run (e.g. ``'PC'``), or ``'U'`` when the string contains
        none of those characters.
    """
    match = _TICKET_PREFIX_RE.search(ticket)
    if match:
        return match.group()
    return 'U'
# Compiled once at import time; the raw string keeps "\d" as a regex escape
# rather than an invalid Python string escape.
_TICKET_NUMBER_RE = re.compile(r"([\d]+$)")


def getTicketNumber(ticket):
    """Return the run of digits that ends ``ticket``.

    Args:
        ticket: ticket string such as ``'A/5 21171'``.

    Returns:
        The trailing digits as a string (e.g. ``'21171'``), or ``'0'`` when
        the string does not end in a digit.
    """
    match = _TICKET_NUMBER_RE.search(ticket)
    if match:
        return match.group()
    return '0'
def processFare():
global df
df[‘Fare‘][ np.isnan(df[‘Fare‘]) ] = df[‘Fare‘].median()
df[‘Fare‘][ np.where(df[‘Fare‘]==0)[0] ] = df[‘Fare‘][ df[‘Fare‘].nonzero()[0] ]
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2015-03-23 16:38 kaggle-Titanic\
文件 14597 2015-03-23 16:38 kaggle-Titanic\dataProcess.py
文件 72565 2015-03-23 16:38 kaggle-Titanic\figure_1.png
文件 4216 2015-03-23 16:38 kaggle-Titanic\randomForest.py
文件 200 2015-03-23 16:38 kaggle-Titanic\README.md
文件 2839 2015-03-23 16:38 kaggle-Titanic\result.csv
文件 28210 2015-03-23 16:38 kaggle-Titanic\test.csv
文件 60302 2015-03-23 16:38 kaggle-Titanic\train.csv
- 上一篇:手机号绑定百度id查询器
- 下一篇:GABOR特征提取
评论
共有 条评论