资源简介
KDD Cup 2015 竞赛代码全公开,用于预测慕课(MOOC)辍学率,准确率接近95%。
代码片段和文件信息
# -*- coding:utf-8 -*-
'''
@lptMusketeers 2017.10.20
'''
import pickle
import pandas as pd
import numpy as np
from unittest.mock import inplace
class PreProcess(object):
def gen_courseid_dict(self, source_path):
    """Build a mapping from each distinct course id to a dense integer code.

    Only the first column of the CSV at ``source_path`` is read, and it must
    be named ``course_id``. Codes are assigned in order of first appearance,
    starting at 0.

    Args:
        source_path: Path (or file-like object) of the CSV to read.

    Returns:
        dict mapping each unique ``course_id`` value to its integer code.
    """
    course_ids = pd.read_csv(source_path, usecols=[0]).course_id
    # factorize()[1] holds the unique values in first-seen order.
    unique_courses = pd.factorize(course_ids)[1]
    course_dict = {course: code for code, course in enumerate(unique_courses)}
    print("course_dict done...")
    return course_dict
def gen_username_dict(self, source_path_train, source_path_test):
    """Build a mapping from each username to a dense integer code.

    Only the second column of each CSV is read, and it must be named
    ``username``. Usernames from the train file are numbered first, in order
    of first appearance starting at 0. Usernames that occur only in the test
    file are then appended, continuing the numbering.

    Args:
        source_path_train: Path of the training CSV.
        source_path_test: Path of the test CSV.

    Returns:
        dict mapping every username seen in either file to its integer code.
    """
    train_names = pd.read_csv(source_path_train, usecols=[1]).username
    test_names = pd.read_csv(source_path_test, usecols=[1]).username

    username_dict = {
        name: code for code, name in enumerate(pd.factorize(train_names)[1])
    }
    # Test-only users get the next free codes; users already seen in the
    # train file keep the code assigned there.
    for name in pd.factorize(test_names)[1]:
        if name not in username_dict:
            username_dict[name] = len(username_dict)

    print("username_dict done...")
    return username_dict
def course_map(self, x):
    """Return the integer code stored for course id ``x`` in ``self.course_dict``.

    Raises:
        KeyError: If ``x`` is not a key of ``self.course_dict``.
    """
    return self.course_dict[x]
def username_map(self, x):
    """Return the integer code stored for username ``x`` in ``self.username_dict``.

    Raises:
        KeyError: If ``x`` is not a key of ``self.username_dict``.
    """
    return self.username_dict[x]
def time_split(self, x):
    """Return the first ten characters of ``x``.

    For a timestamp shaped like ``YYYY-MM-DDTHH:MM:SS`` this is the date part.
    """
    return x[:10]
def enrollment_map(self, source_path_train, source_path_test,
                   target_path_train, target_path_test):
    """Re-encode the enrollment CSVs with integer user and course codes.

    The first three columns of each source file are read (per the original
    note: enrollment_id, username, course_id). Column 1 is passed through
    ``self.username_map`` and column 2 through ``self.course_map``. Each
    result is written to its target path without the index column.

    Args:
        source_path_train: Path of the training enrollment CSV.
        source_path_test: Path of the test enrollment CSV.
        target_path_train: Output path for the encoded training CSV.
        target_path_test: Output path for the encoded test CSV.
    """
    print("read enrollment_train.csv")
    # enrollment_train.csv columns: enrollment_id, username, course_id
    converters = {1: self.username_map, 2: self.course_map}
    for source_path, target_path in (
        (source_path_train, target_path_train),
        (source_path_test, target_path_test),
    ):
        frame = pd.read_csv(source_path, usecols=[0, 1, 2], converters=converters)
        frame.to_csv(target_path, index=False)
def date_map(self, source_path, target_path):
    """Encode course ids in the date CSV and add the course length in days.

    Column 0 is passed through ``self.course_map``. A ``day_nums`` column is
    added holding the whole-day difference between the ``to`` and ``from``
    columns. The result is written to ``target_path`` without the index.

    Args:
        source_path: Path of the date CSV; must contain ``from`` and ``to``
            columns parseable by ``pandas.to_datetime``.
        target_path: Output path for the augmented CSV.
    """
    print("read date.csv")
    frame = pd.read_csv(source_path, converters={0: self.course_map})
    span = pd.to_datetime(frame["to"]) - pd.to_datetime(frame["from"])
    # .dt.days extracts the day component of each timedelta in one
    # vectorized step instead of a per-row lambda.
    frame["day_nums"] = span.dt.days
    frame.to_csv(target_path, index=False)
def log_clean(self, source_path, target_path):
    """Trim the log CSV and split its timestamp into date and time columns.

    Columns 0, 1 and 3 of the source file are kept; one of them must be
    named ``time``. A new ``date`` column receives the first ten characters
    of ``time``, and ``time`` is replaced by everything from character 11
    onward. The result is written to ``target_path`` without the index.

    NOTE(review): the column selection was reconstructed from a garbled
    ``usecols=[013]`` -- confirm ``[0, 1, 3]`` against the real log schema.

    Args:
        source_path: Path of the raw log CSV.
        target_path: Output path for the cleaned CSV.
    """
    print("read log_train.csv ")
    frame = pd.read_csv(source_path, usecols=[0, 1, 3])  # change
    stamps = frame["time"]
    # Vectorized string slicing; index 10 is the separator between the
    # date and time parts and is dropped from both.
    frame["date"] = stamps.str[:10]
    frame["time"] = stamps.str[11:]
    frame.to_csv(target_path, index=False)
def course_enrollment(selfsource_path_trainsource_path_testsource_path_datetarget_path_traintarget_path_test):
print(“course_enrollment....“)
df1 = pd.read_csv(source_path_train) #如果不设置index,read_csv读取是默认index(序号),不是第一列
df2 = pd.read_csv(source_path_test)
df3 = pd.read_csv(source_path_date)
df4 = pd.merge(df1df3how=“left“left_on=“course_id“right_
评论
共有 条评论