• 大小: 56.18MB
    文件类型: .7z
    金币: 1
    下载: 0 次
    发布日期: 2022-11-08
  • 语言: 其他
  • 标签: kdd2015  

资源简介

KDD Cup 2015 竞赛代码全公开:预测慕课(MOOC)辍学率,准确率接近 95%。

资源截图

代码片段和文件信息

# -*- coding:utf-8 -*-

'''
@lptMusketeers 2017.10.20
'''
import pickle
import pandas as pd
import numpy as np
from unittest.mock import inplace

class PreProcess(object):
    def gen_courseid_dict(selfsource_path):
        df = pd.read_csv(source_pathusecols=[0])
        course_map = pd.factorize(df.course_id)[1]
        course_dict = dict(zip(course_maprange(len(course_map))))
        print (“course_dict done...“)
        return course_dict
    
    def gen_username_dict(selfsource_path_trainsource_path_test):
        df = pd.read_csv(source_path_trainusecols=[1])
        username_map = pd.factorize(df.username)[1]
        username_dict = dict(zip(username_maprange(len(username_map))))
        
        df2 = pd.read_csv(source_path_testusecols=[1])
        username_map2 = pd.factorize(df2.username)[1]
        diff = [w for w in username_map2 if w not in username_map]
        username_dict2 =dict(zip(diffnp.arange(len(username_map)len(username_map)+len(diff))))
        
        username_dict.update(username_dict2)
        print (“username_dict done...“)
        return username_dict
    
    def course_map(selfx):
        return self.course_dict[x]
    
    def username_map(selfx):
        return self.username_dict[x]
    
    def time_split(selfx):
        x = x[:10]
        return x
    
    def enrollment_map(selfsource_path_trainsource_path_testtarget_path_traintarget_path_test):
        print (“read enrollment_train.csv“)
        # enrollment_train.csv enrollment_id username course_id
        df1 = pd.read_csv(source_path_trainusecols=[012]converters={1:self.username_map2:self.course_map})
        df1.to_csv(target_path_trainindex=False)
        df2 = pd.read_csv(source_path_testusecols=[012]converters={1:self.username_map2:self.course_map})
        df2.to_csv(target_path_testindex=False)
        
    def date_map(selfsource_pathtarget_path):
        print (“read date.csv“)
        df1 = pd.read_csv(source_pathconverters={0:self.course_map})
        df1[“day_nums“]= (pd.to_datetime(df1[“to“]) - pd.to_datetime(df1[“from“]))
        df1[“day_nums“] = df1[“day_nums“].map(lambda x: x.days)
        df1.to_csv(target_pathindex=False)
    
    def log_clean(selfsource_pathtarget_path):
        print (“read log_train.csv “)
        df1 = pd.read_csv(source_pathusecols=[013]) #change
        df1[“date“] = df1[“time“].map(lambda x: x[:10])
        df1[“time“] = df1[“time“].map(lambda x: x[11:])
        
        df1.to_csv(target_pathindex=False)
    
    def course_enrollment(selfsource_path_trainsource_path_testsource_path_datetarget_path_traintarget_path_test):
        print(“course_enrollment....“)
        df1 = pd.read_csv(source_path_train) #如果不设置index,read_csv读取是默认index(序号),不是第一列
        df2 = pd.read_csv(source_path_test)
        df3 = pd.read_csv(source_path_date)
        df4 = pd.merge(df1df3how=“left“left_on=“course_id“right_

评论

共有 条评论

相关资源