• 大小: 10.63MB
    文件类型: .zip
    金币: 1
    下载: 0 次
    发布日期: 2023-10-08
  • 语言: 其他
  • 标签: 数据分析  python  

资源简介

基于豆瓣的全部电影数据,分析国家/地区、类别、年份、评分、评分人数等各项参数之间的联系。整体上主要比较世界电影与中国电影,以及中国大陆电影与中国港台电影之间的差别,并分析各参数之间是否存在关联性及其对评分产生的影响。数据来源于豆瓣;我不对评分做主观评价,只对数据进行分析展示。能力偏弱,但图像不弱。

资源截图

代码片段和文件信息

# -*-coding:utf-8-*-

from urllib import request
from download import down_html
from parase import parase_html
from output import output_all
from url_manager import manage_url
# Entry page of the crawl: Douban's movie tag page in "cloud" view.
root_url = "https://movie.douban.com/tag/?view=cloud"

class DouBan_Spider(object):
    def __init__(self):
        self.category_dic = {} #用来存储所有热门分类的名字和对应的电影数目catename和catename_num两个属性
        self.down_class = down_html.DownHtml()  #下载网页
        self.parase_class = parase_html.ParaseHtml() #解析网页
        self.output_class = output_all.OutPut() #存储信息
        self.manage_class = manage_url.UrlManager() #链接管理
        self.tag_right = 1
        self.tag_error = 0

    #获取分类下所有热门分类
    def get_hotcategory(selfurl):
        print(“get all category!“)
        page_content = self.down_class.download(url)
        self.category_dic = self.parase_class.parase_category(page_content)
        self.output_class.output_category(self.category_dic) #将类别信息写入本地文件

    #得到某个类别下所有电影的链接
    def get_one_cate_all_movie_href(selftag_url):
        page_content = self.down_class.download(tag_url)
        page_num = self.parase_class.parase_pagenum(page_content) #得到该分类总共多少页
        movies_href = []
        try:
            for page in range(int(page_num)):
                page_url = “https://movie.douban.com/tag/%E8%8B%B1%E5%9B%BD?start=“+str(page*20)+“&type=T“
                tag_page_content = self.down_class.download(page_url)
                movies_href = self.parase_class.parase_page_all_movies(tag_page_contentmovies_href)
                print(“all:“page_num“  right:“self.tag_right“  error:“self.tag_error“  page:“page+1“  URL 获取完毕“)
                self.tag_right +=1
        except Exception as e:
            print(e)
            self.tag_error+=1
            pass
        print(“该类别下对应的电影数目为:\t“len(movies_href))
        self.output_class.output_all_movies_href(movies_href)
        print(“开始获取该类别下对应的电影信息:\n “)
        self.get_one_movie_message(movies_href)      #该类别对应的链接抓取完毕,进行这些链接对应电影信息的抓取

    #下载每个电影的详细信息
    def get_one_movie_message(selfmovie_link):
        all_count = 1
        error = 0
        self.manage_class.add_new_urls(movie_link) #将一个类别对应的链接全部加载到manage_url管理的新的链接中
        while(self.manage_class.has_new_url()):
            try:
                one_url = self.manage_class.get_new_url()  #获取一个url
                # one_url=“ https://movie.douban.com/subject/1297970/“
                print(“Right:“all_count“  URL:“one_url.strip()“  ““Error:“error)
                page_content_one = self.down_class.download(one_url)  #下载该网页对应的源代码
                one_movic_dic = self.parase_class.parase_one_movie_message(page_content_one) #解析得到一部电影的具体数据
                id = one_url.split(“/“)[-2]
                self.output_class.output_one_movie_message(one_movic_dicid) #将该部电影的数据输出
                all_count+=1
            except Exception as e:
                error +=1
                print(e)
                pass

    #获取每部电影的短评
    def get_one_movie_short_dis(selfmovie_url):
  

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2016-09-21 09:03  Douban_Movies_Analysic-master\
     目录           0  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\
     目录           0  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\.idea\
     文件         918  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\.idea\DouBan_Spider.iml
     文件         545  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\.idea\misc.xml
     文件         961  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\.idea\modules.xml
     文件         183  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\.idea\vcs.xml
     文件       48555  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\.idea\workspace.xml
     目录           0  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\download\
     文件           0  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\download\__init__.py
     目录           0  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\download\__pycache__\
     文件         153  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\download\__pycache__\__init__.cpython-34.pyc
     文件        3914  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\download\__pycache__\down_html.cpython-34.pyc
     文件        5884  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\download\down_html.py
     目录           0  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\file_output\
     文件        3198  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\file_output\category.csv
     目录           0  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\file_output\duanping\
     文件       68500  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\file_output\duanping\5045678 .txt
     文件     1143200  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\file_output\movie.csv
     文件     1449138  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\file_output\movie_summary.txt
     文件     6072632  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\file_output\movies_links.csv
     目录           0  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\file_output\yingping\
     文件     2042984  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\file_output\yingping\5045678 .txt
     文件        6343  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\main.py
     目录           0  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\output\
     文件           0  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\output\__init__.py
     目录           0  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\output\__pycache__\
     文件         151  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\output\__pycache__\__init__.cpython-34.pyc
     文件        2790  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\output\__pycache__\output_all.cpython-34.pyc
     文件        2762  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\output\output_all.py
     目录           0  2016-09-21 09:03  Douban_Movies_Analysic-master\DouBan_Spider\parase\
............此处省略113个文件信息

评论

共有 条评论