资源简介
西电数据挖掘大作业之电影评级数据分析。
代码片段和文件信息
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
# Load the movie table. Each record in movies.dat has three fields
# separated by "::"; the file has no header row, so names are supplied here.
mnames = ['movieid', 'title', 'genres']
# A multi-character separator is handled as a regular expression by pandas,
# which requires the Python parsing engine.
# NOTE(review): if this raises UnicodeDecodeError, the data file is probably
# not UTF-8 -- confirm its encoding and pass encoding=... accordingly.
movie = pd.read_table('movies.dat', sep='::', header=None,
                      names=mnames, engine='python')
print('----------------------------------------------------------------------------')
print("examples of movies:\n")
print(movie.head(5))
print('\nthe length of movies data:%d' % len(movie))
print('----------------------------------------------------------------------------\n')
# Load the user table. Each record in users.dat has five fields separated
# by "::"; the file has no header row, so names are supplied here.
unames = ['userid', 'gender', 'age', 'occupation', 'zip']
# A multi-character separator is handled as a regular expression by pandas,
# which requires the Python parsing engine.
user = pd.read_table('users.dat', sep='::', header=None,
                     names=unames, engine='python')
print('----------------------------------------------------------------------------')
print("examples of users:\n")
print(user.head(5))
print('\nthe length of users data:%d ' % len(user))
print('----------------------------------------------------------------------------\n')
# Load the rating table. Each record in ratings.dat has four fields
# separated by "::"; the file has no header row, so names are supplied here.
rnames = ['userid', 'movieid', 'ratings', 'timestamp']
# A multi-character separator is handled as a regular expression by pandas,
# which requires the Python parsing engine.
rating = pd.read_table('ratings.dat', sep='::',
                       header=None, names=rnames, engine='python')
print('----------------------------------------------------------------------------')
print("examples of ratings:\n")
print(rating.head(5))
print('\nthe length of ratings data:%d ' % len(rating))
print('----------------------------------------------------------------------------\n')
# Combine the three tables into one frame. merge() with no explicit key joins
# on the column names the two frames share: 'userid' for rating/user, then
# 'movieid' for the result and movie.
data = rating.merge(user).merge(movie)
print('----------------------------------------------------------------------------')
print("examples of datas:\n")
print(data.head(5))
print('----------------------------------------------------------------------------\n')
# Average rating of every movie, split by gender: one row per title, one
# column per distinct 'gender' value. A title with no ratings from a given
# gender gets 0 in that column (fill_value) instead of NaN.
# The string 'mean' is used rather than np.mean because recent pandas
# versions deprecate passing the NumPy callable; the result is the same.
meanratings = pd.pivot_table(data, index=['title'], values='ratings',
                             columns=['gender'], aggfunc='mean', fill_value=0)
print('----------------------------------------------------------------------------')
print('The average score of each film (by Gender):')
print(meanratings[:10])
print('----------------------------------------------------------------------------\n')
# Drop movies with fewer than 100 ratings, then rank the remaining movies by
# their per-gender average score.
movie_rating = data.groupby(['title']).size()  # number of ratings per title
# ">=" keeps movies with exactly 100 ratings; only those with fewer than 100
# are filtered out.
movie_ix = movie_rating.index[movie_rating >= 100]
# Explicit copy: mrating receives an extra column later on, and that
# assignment must act on an independent frame rather than on meanratings.
mrating = meanratings.loc[movie_ix].copy()
# Movies rated highest by women
print('----------------------------------------------------------------------------')
print('Women favorite movies TOP10: ')
print(mrating.sort_values(by=['F'], ascending=False).head(10))
print('----------------------------------------------------------------------------\n')
# Movies rated highest by men
print('----------------------------------------------------------------------------')
print('Men favorite movies TOP10: ')
print(mrating.sort_values(by=['M'], ascending=False).head(10))
print('----------------------------------------------------------------------------\n')
# Find the movies on which men and women disagree most: store the absolute
# difference between the female ('F') and male ('M') average scores in a new
# 'diff' column.
mrating['diff'] = np.abs(mrating['F'] - mrating['M'])
#print(mrating.sort_values(by=[‘diff‘
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2018-10-18 18:07 movies\
文件 171308 2003-03-27 05:18 movies\movies.dat
文件 5006 2018-10-18 22:09 movies\movies.py
文件 24594131 2003-03-01 05:53 movies\ratings.dat
文件 5577 2016-01-30 04:39 movies\README
文件 0 2018-10-18 12:22 movies\text
文件 134368 2003-03-01 05:53 movies\users.dat
评论
共有 条评论