资源简介
实现对豆瓣电影 Top 250 榜单中所有电影进行爬取的爬虫实例。
代码片段和文件信息
# # -*- coding:utf8 -*-
#coding:utf8
# 首先用于确定编码,加上这句
import re
import time
from urllib.request import urlopen

from bs4 import BeautifulSoup
#导入pymysql的包
import pymysql.cursors
# Stage 1: walk the paginated Douban Top 250 listing and collect the
# detail-page URL of every entry into `movie`.
#
# An earlier home-page experiment that lived here as a dead triple-quoted
# string has been removed; it was never executed.

# Pause between successive listing-page requests, in seconds. The original
# script burned CPU in an empty nested loop to get a delay; sleeping gives
# the same pacing without the busy-wait.
REQUEST_DELAY_SECONDS = 1.0

# The listing is addressed by a `start` offset that advances in steps of 25:
# start=0, 25, ..., 225 (ten pages).
k = [
    "https://movie.douban.com/top250?start=" + str(start) + "&filter"
    for start in range(0, 250, 25)
]
print(len(k))

# Detail-page URLs, in listing order.
movie = []
for page_url in k:
    time.sleep(REQUEST_DELAY_SECONDS)
    print(page_url)
    # Parse inside the `with` so the HTTP response is closed as soon as the
    # document has been read.
    with urlopen(page_url) as response:
        soup = BeautifulSoup(response, "lxml")
    # Every <div class="item"> is one entry; the link is the <a> inside its
    # first nested <div>.
    for item in soup.find_all("div", {"class": "item"}):
        href = item.div.a["href"]
        print("\n", href)
        movie.append(href)
print(movie)
print(len(movie))
#movie=[‘https://movie.douban.com/subject/1292052/‘ ‘https://movie.douban.com/subject/1295644/‘]
#movie=[]
# Stage 2: open every URL collected in `movie` and pull individual fields
# (title, first image URL, director line, genres, release date, synopsis)
# out of the parsed page.
# NOTE(review): this excerpt has lost its indentation, its ASCII quotes and
# the commas between call arguments, and it stops inside the `try:` below
# before any `except` clause -- the rest of the loop body is not visible here.
Info=[]
for movieurl in movie:
try:
#print(movieurl)
htt=urlopen(movieurl)
#print(htt.info())
# Empty loop body; presumably a crude delay between requests -- confirm.
for j in range(1 1000):
pass
bsObj1 = BeautifulSoup(htt “lxml“) # convert the HTML response into a BeautifulSoup object
#print(bool(bsObj1))
# liListmovie = bsObj1.findAll(“div“ {“id“: “content“}) # find every div with id=“content“
# Read the text content of a specific tag: the first
# <span property="v:itemreviewed"> supplies the title.
title = bsObj1.findAll(“span“{“property“:“v:itemreviewed“})[0].get_text()
#print(“the title is no problem!“)
# Take the src attribute of the first <img> on the page.
imageurl = bsObj1.findAll(‘img‘)[0][‘src‘]
# Director information: text of the first <span class="pl"> joined by ":"
# to the text of the first <a rel="v:directedBy">. Despite its name,
# `actor` therefore holds the director line.
daoyan1 = bsObj1.findAll(“span“ {“class“: “pl“})[0].get_text()
daoyan2 = bsObj1.findAll(“a“ {“rel“: “v:directedBy“})[0].get_text()
actor = daoyan1 + “:“ + daoyan2
# Genre information: start from the text of the fourth <span class="pl">
# (presumably the genre label -- depends on page layout, confirm) and append
# "/" plus the text of every <span property="v:genre">.
kinds = bsObj1.findAll(“span“ {“class“: “pl“})[3].get_text()
kind2 = bsObj1.findAll(“span“ {“property“: “v:genre“})
for lll in kind2:
kinds += “/“ + lll.get_text()
# Release date: keep the first regex match found in the text of the first
# <span property="v:initialReleaseDate">.
#uploaddate = bsObj1.findAll(“span“ {“class“: “pl“})[6].get_text()
uploaddate1 = bsObj1.findAll(“span“ {“property“: “v:initialReleaseDate“})[0].get_text()
# NOTE(review): the {12} quantifiers look like a garbled {1,2} (month/day
# digits) -- confirm against the original script.
parttrn =r“\d{4}-\d{12}-\d{12}“
# NOTE(review): `parttrnuploaddate1` is presumably `parttrn, uploaddate1`
# with the separating comma lost -- confirm.
c1 = re.findall(parttrnuploaddate1)
uploaddate=c1[0]
# Plot synopsis: text of the first <span property="v:summary">.
synopsis = bsObj1.findAll(“span“ {“property“: “v:summary“})[0].get_text()
# Douban movie ranking
- 上一篇:python调用cplex解决tsp问题
- 下一篇:朴素贝叶斯过滤垃圾邮件源码及数据
评论
共有 条评论