资源简介

平台部分主要是Hadoop分布式系统,并在该系统之上集成了Spark,HBase,Hive,Sqoop,Mahout等组件,继而进行相关的数据分析。
该项目主要分为以下几部分:
1:数据采集
主要是基于豆瓣电影的数据,进行分析,所以首先要爬取相关的电影数据,对应的源代码在DouBan_Spider目录下,主要是采用Python + BeautifulSoup + urllib进行数据采集
2:ETL预处理
3:数据分析
4:可视化
代码封装完好,
适用于影视情感分析,影评分析,电影类型分析,以及推荐系统的建立。

资源截图

代码片段和文件信息

# -*-coding:utf-8-*-

from urllib import request
from download import down_html
from parase import parase_html
from output import output_all
from url_manager import manage_url
# Douban movie tag index in "cloud" view; presumably the page the crawl
# starts from (no caller is visible in this excerpt -- confirm).
root_url = "https://movie.douban.com/tag/?view=cloud"

class DouBan_Spider(object):
    def __init__(self):
        self.category_dic = {} #用来存储所有热门分类的名字和对应的电影数目catename和catename_num两个属性
        self.down_class = down_html.DownHtml()  #下载网页
        self.parase_class = parase_html.ParaseHtml() #解析网页
        self.output_class = output_all.OutPut() #存储信息
        self.manage_class = manage_url.UrlManager() #链接管理
        self.tag_right = 1
        self.tag_error = 0

    #获取分类下所有热门分类
    def get_hotcategory(selfurl):
        print(“get all category!“)
        page_content = self.down_class.download(url)
        self.category_dic = self.parase_class.parase_category(page_content)
        self.output_class.output_category(self.category_dic) #将类别信息写入本地文件

    #得到某个类别下所有电影的链接
    def get_one_cate_all_movie_href(selftag_url):
        page_content = self.down_class.download(tag_url)
        page_num = self.parase_class.parase_pagenum(page_content) #得到该分类总共多少页
        movies_href = []
        try:
            for page in range(int(page_num)):
                page_url = “https://movie.douban.com/tag/%E8%8B%B1%E5%9B%BD?start=“+str(page*20)+“&type=T“
                tag_page_content = self.down_class.download(page_url)
                movies_href = self.parase_class.parase_page_all_movies(tag_page_contentmovies_href)
                print(“all:“page_num“  right:“self.tag_right“  error:“self.tag_error“  page:“page+1“  URL 获取完毕“)
                self.tag_right +=1
        except Exception as e:
            print(e)
            self.tag_error+=1
            pass
        print(“该类别下对应的电影数目为:\t“len(movies_href))
        self.output_class.output_all_movies_href(movies_href)
        print(“开始获取该类别下对应的电影信息:\n “)
        self.get_one_movie_message(movies_href)      #该类别对应的链接抓取完毕,进行这些链接对应电影信息的抓取

    #下载每个电影的详细信息
    def get_one_movie_message(selfmovie_link):
        all_count = 1
        error = 0
        self.manage_class.add_new_urls(movie_link) #将一个类别对应的链接全部加载到manage_url管理的新的链接中
        while(self.manage_class.has_new_url()):
            try:
                one_url = self.manage_class.get_new_url()  #获取一个url
                # one_url=“ https://movie.douban.com/subject/1297970/“
                print(“Right:“all_count“  URL:“one_url.strip()“  ““Error:“error)
                page_content_one = self.down_class.download(one_url)  #下载该网页对应的源代码
                one_movic_dic = self.parase_class.parase_one_movie_message(page_content_one) #解析得到一部电影的具体数据
                id = one_url.split(“/“)[-2]
                self.output_class.output_one_movie_message(one_movic_dicid) #将该部电影的数据输出
                all_count+=1
            except Exception as e:
                error +=1
                print(e)
                pass

    #获取每部电影的短评
    def get_one_movie_short_dis(selfmovie_url):
  

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2019-11-26 02:25  Douban_Movies_Analysic-master\
     目录           0  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\
     目录           0  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\.idea\
     文件         918  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\.idea\DouBan_Spider.iml
     文件         545  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\.idea\misc.xml
     文件         961  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\.idea\modules.xml
     文件         183  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\.idea\vcs.xml
     文件       48555  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\.idea\workspace.xml
     目录           0  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\download\
     文件           0  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\download\__init__.py
     目录           0  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\download\__pycache__\
     文件         153  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\download\__pycache__\__init__.cpython-34.pyc
     文件        3914  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\download\__pycache__\down_html.cpython-34.pyc
     文件        5884  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\download\down_html.py
     目录           0  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\file_output\
     文件        3198  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\file_output\category.csv
     目录           0  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\file_output\duanping\
     文件       68500  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\file_output\duanping\5045678 .txt
     文件     1143200  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\file_output\movie.csv
     文件     1449138  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\file_output\movie_summary.txt
     文件     6072632  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\file_output\movies_links.csv
     目录           0  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\file_output\yingping\
     文件     2042984  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\file_output\yingping\5045678 .txt
     文件        6343  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\main.py
     目录           0  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\output\
     文件           0  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\output\__init__.py
     目录           0  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\output\__pycache__\
     文件         151  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\output\__pycache__\__init__.cpython-34.pyc
     文件        2790  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\output\__pycache__\output_all.cpython-34.pyc
     文件        2762  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\output\output_all.py
     目录           0  2019-11-26 02:25  Douban_Movies_Analysic-master\DouBan_Spider\parase\
............此处省略113个文件信息

评论

共有 条评论