-
大小: 5.28MB | 文件类型: .zip | 金币: 1 | 下载: 0 次 | 发布日期: 2023-10-04
- 语言: Python
- 标签:
资源简介
网站图片爬虫(已包含:微博,微信公众号,花瓣网)及免费IP代理 豆瓣电影爬虫
代码片段和文件信息
#encoding:utf-8
import requests
import json
import ossystime
from lxml import etree
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from bs4 import BeautifulSoup
import re
reload(sys)
sys.setdefaultencoding(“utf-8“)
LANGUAGES_RE = re.compile(ur“语言: (.+?)
“)
COUNTRIES_RE = re.compile(ur“制片国家/地区: (.+?)
“)
ALTERNATE_NAME_RE = re.compile(ur“又名: (.+?)
“)
RELEASE_TIME_RE = re.compile(ur“上映日期: (.+?)
“)
NUM_RE = re.compile(r“(\d+)“)
data_save_file = “douban_donghua_results.txt“
headers = {
‘Accept‘:‘*/*‘
‘Accept-Encoding‘:‘gzip deflate br‘
‘Accept-Language‘:‘zh-CNzh;q=0.8en;q=0.6‘
‘Connection‘:‘keep-alive‘
‘Host‘:‘movie.douban.com‘
‘Referer‘:‘https://movie.douban.com/explore‘
‘User-Agent‘:‘Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML like Gecko) Chrome/59.0.3071.115 Safari/537.36‘
‘X-Requested-With‘:‘xmlHttpRequest‘
}
def get_item_list(d_url, d_type, d_tag, d_sort, d_page_limit, d_page_start):
    """Fetch one page of movie subjects from douban's JSON list API.

    NOTE(review): the scraped source had all commas stripped; the parameter
    list and call punctuation are reconstructed from the obvious word
    boundaries (d_url / d_type / d_tag / d_sort / d_page_limit / d_page_start).

    :param d_url: API endpoint URL to query.
    :param d_type: subject type passed as the ``type`` query parameter.
    :param d_tag: tag filter passed as ``tag``.
    :param d_sort: sort order; omitted from the query when empty.
    :param d_page_limit: page size (``page_limit``).
    :param d_page_start: offset of the first item (``page_start``).
    :return: the ``subjects`` array of the JSON response (list of dicts).
    """
    params = {}
    params["type"] = d_type
    params["tag"] = d_tag
    if d_sort != "":
        params["sort"] = d_sort
    params["page_limit"] = d_page_limit
    params["page_start"] = d_page_start
    # 10s timeout so a stalled connection fails fast instead of hanging.
    response = requests.get(d_url, headers=headers, params=params, timeout=10)
    json_obj = response.json()
    return json_obj["subjects"]
def get_item_list_from_newsearch(d_url, d_sort, d_range, d_tag, d_page_start):
    """Fetch one page of results from douban's newer search API.

    NOTE(review): punctuation reconstructed from a comma-stripped paste; the
    parameter split (d_url / d_sort / d_range / d_tag / d_page_start) follows
    the query keys assigned below.

    :param d_url: API endpoint URL to query.
    :param d_sort: sort order passed as ``sort``.
    :param d_range: rating range passed as ``range``.
    :param d_tag: tag filter passed as ``tags``.
    :param d_page_start: offset of the first item (``start``).
    :return: the ``data`` array of the JSON response (list of dicts).
    """
    params = {}
    params["sort"] = d_sort
    params["tags"] = d_tag
    params["range"] = d_range
    params["start"] = d_page_start
    # 10s timeout, consistent with get_item_list.
    response = requests.get(d_url, headers=headers, params=params, timeout=10)
    json_obj = response.json()
    return json_obj["data"]
def get_item_detail(item_detail_url):
result_obj = {}
result_obj[“subject_id“] = int(item_detail_url.split(“/“)[-2])
celebrities_url = “https://movie.douban.com/subject/“+str(result_obj[“subject_id“])+“/celebrities“
(directors_cn_namesdirectors_en_namesactors_cn_namesactors_en_names)=get_directors_and_actors(celebrities_url)
result_obj[“directors_cn_names“] = directors_cn_names
result_obj[“directors_en_names“] = directors_en_names
result_obj[“actors_cn_names“] = actors_cn_names
result_obj[“actors_en_names“] = actors_en_names
response = requests.get(item_detail_urlheaders = headerstimeout = 10)
selector = etree.HTML(response.text)
s_response = HtmlResponse(url=item_detail_urlbody = response.textencoding=‘utf-8‘)
name = s_response.selector.xpath(“//title/text()“).extract()
if name: result_obj[“movie_name“] = name[0].replace(u“ (豆瓣)“ ““).strip()
genres = s_response.selector.xpath(“//span[@property=‘v:genre‘]/text()“).extract()
if genres: result_obj[“genres“] = genres
S = ““.join(s_response.selector.xpath(“//div[@id=‘info‘]“).extract())
M = COUNTRIES_RE.search(S)
if M is not None:
result_obj[“countries“] = [country.strip() for country in M.group(1).split(“/“)]
L = LANGUAGES_RE.search(S)
if L is not None:
result_obj[“languages“] = [ lang.st
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2017-10-09 07:55 image_crawler-master\
目录 0 2017-10-09 07:55 image_crawler-master\DoubanMovie\
文件 6307 2017-10-09 07:55 image_crawler-master\DoubanMovie\movie_crawler.py
文件 1850 2017-10-09 07:55 image_crawler-master\DoubanMovie\write_to_mysql.py
目录 0 2017-10-09 07:55 image_crawler-master\Huaban\
文件 4257 2017-10-09 07:55 image_crawler-master\Huaban\explain.md
文件 4719 2017-10-09 07:55 image_crawler-master\Huaban\huaban_crawler.py
文件 1437852 2017-10-09 07:55 image_crawler-master\Huaban\huaban_travel_places_result.txt
目录 0 2017-10-09 07:55 image_crawler-master\IpProxy\
目录 0 2017-10-09 07:55 image_crawler-master\IpProxy\Ip181FreeProxy\
文件 1086 2017-10-09 07:55 image_crawler-master\IpProxy\Ip181FreeProxy\get_ip181.py
目录 0 2017-10-09 07:55 image_crawler-master\IpProxy\KuaiFreeProxy\
文件 1088 2017-10-09 07:55 image_crawler-master\IpProxy\KuaiFreeProxy\get_kuaifreeproxy.py
目录 0 2017-10-09 07:55 image_crawler-master\IpProxy\XunFreeProxy\
文件 1155 2017-10-09 07:55 image_crawler-master\IpProxy\XunFreeProxy\get_xunfreeproxy.py
文件 714 2017-10-09 07:55 image_crawler-master\README.md
目录 0 2017-10-09 07:55 image_crawler-master\SinaWeibo\
文件 10883820 2017-10-09 07:55 image_crawler-master\SinaWeibo\chromedriver
文件 30151 2017-10-09 07:55 image_crawler-master\SinaWeibo\image_result.md
文件 8873 2017-10-09 07:55 image_crawler-master\SinaWeibo\weibo_crawler.py
文件 5080 2017-10-09 07:55 image_crawler-master\SinaWeibo\weibo_hot_topic_crawler.py
目录 0 2017-10-09 07:55 image_crawler-master\WechatOfficialAccounts\
文件 2333 2017-10-09 07:55 image_crawler-master\WechatOfficialAccounts\spider_wechat_official_accounts.py
相关资源
- Python-WenshuSpiderScrapy框架爬取中国裁判
- Python-智联51job招聘需求挖掘采集和分
- Python-指定用户的所有抖音视频以及收
- Python-淘宝天猫商品数据抓取代码和
- Python-Boss直聘Python招聘岗位信息爬取和
- Python-精准的百度指数抓取综合已有百
- Python-Python爬虫京东自动打码登录指定
- Python-百度指数爬虫可以自定义时间段
- Python-爬取各种开源软件的官方仓库历
- Python-微信公众号历史文章爬取api
- Python-readwx爬取微信公众号爬取搜狗微
- Python-指定的抖音Douyin号的视频抖音爬
- Python-用于爬取万方数据库文献摘要数
- Python-爬取微信公众号文章
评论
共有 0 条评论