资源简介
利用 Python 爬取微博内容,可爬取任意关键字下的全部微博内容。
代码片段和文件信息
# -*- coding: utf-8 -*-
import datetime
import json
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
# Python 2 only: switch the interpreter-wide default encoding to UTF-8
# (presumably so the Chinese text handled by this script survives implicit
# str/unicode conversions -- confirm against the rest of the script).
# Python 3 has neither the builtin reload() nor sys.setdefaultencoding(), and
# its default encoding is already UTF-8, so the hack is skipped there.
if sys.version_info[0] < 3:
    # reload() brings back sys.setdefaultencoding, which Python 2 removes
    # from the sys module during interpreter start-up.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')
class weibo():
    """Scraper for keyword search results and comment pages on weibo.cn.

    Pages are downloaded with ``requests`` using the browser-like headers in
    ``header`` and parsed into ``BeautifulSoup`` trees with the ``lxml``
    parser.
    """

    # Headers sent with every request.
    # NOTE(review): the Cookie is a hard-coded login session for one account;
    # its embedded timestamps appear to be from 2017, so it has presumably
    # expired -- replace it with your own session cookie before use and keep
    # real credentials out of version control.
    header = {
        'Accept': (
            'text/html,application/xhtml+xml,application/xml;q=0.9,'
            'image/webp,image/apng,*/*;q=0.8'
        ),
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cookie': (
            '_T_WM=dbeb65f9c841bd67a4f32cab3ddbf7ec; '
            'ALF=1514628952; '
            'SCF=AuIwFvQ5M6uY3uNIsY0hghDyz7lZI2hJXLAmQfi-kZSxz7rrfhFd-xg1a49sZCflizdhw72KVhkyNCabj6L-AfQ.; '
            'SUB=_2A253G6oIDeRhGeBN41oR9ynFwjuIHXVU5zZArDV6PUNbktBeLWPykW1NRAPsHIqkTzNa_zyi5uR2WyUO7jV8VH6z; '
            'SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWjY5CN2FR7gfTeS2hHSFmV5JpX5KMhUgL.Foq01hn7S0M41KM2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMRSh2pShM0Sh-p; '
            'SUHB=0z1wvP8FOQCXXR; '
            'SSOLoginState=1512036952'
        ),
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
        ),
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
    }

    # Seconds requests waits for the server before raising a Timeout; without
    # it a stalled connection would block the crawl indefinitely.
    timeout = 30

    def __init__(self):
        """Create a scraper; no per-instance state is set up."""
        pass

    # The original spelled the constructor ``_init_`` (single underscores), so
    # Python never called it. The old name is kept for any existing caller.
    _init_ = __init__

    def _fetch_soup(self, url, delay):
        """Wait ``delay`` seconds, download ``url`` and return the parsed page."""
        # The pause throttles consecutive requests to the site.
        time.sleep(delay)
        response = requests.get(url, headers=self.header, timeout=self.timeout)
        return BeautifulSoup(response.text, "lxml")

    @staticmethod
    def _force_https(url):
        """Return ``url`` with a leading ``http://`` upgraded to ``https://``."""
        # Anchored on the scheme: replacing every "http" would turn an URL
        # that is already https into "httpss://...".
        return re.sub(r'^http://', 'https://', url)

    def get_soup(self, url):
        """Download ``url`` after a 1.5 s pause and return its BeautifulSoup tree."""
        return self._fetch_soup(url, 1.5)

    def create_url(self, page, starttime, keyword="中印对峙"):
        """Build the search URL for one result page of a single day.

        Example of the URL shape used by the site:
        https://weibo.cn/search/mblog?hideSearchframe=&keyword=%E4%B8%AD%E5%8D%B0%E5%AF%B9%E5%B3%99&advancedfilter=1&starttime=20170630&endtime=20170701&sort=time&page=2

        Args:
            page: Result page number; converted with ``str()``.
            starttime: Date-like object supporting ``+ timedelta`` and
                ``strftime`` (e.g. ``datetime.date``). It is used for both
                the ``starttime`` and ``endtime`` query fields.
            keyword: Search keyword inserted into the URL as-is. Defaults to
                the keyword that was previously hard-coded.

        Returns:
            The search URL as a string.
        """
        url = ("https://weibo.cn/search/mblog/?keyword=" + keyword
               + "&sort=time&advancedfilter=1&")
        # A zero-day offset keeps the search window to the single start day.
        endtime = starttime + datetime.timedelta(days=0)
        endtime = endtime.strftime('%Y%m%d')
        starttime = starttime.strftime('%Y%m%d')
        return (url + 'starttime=' + starttime + '&endtime=' + endtime
                + '&sort=time&page=' + str(page))

    # Record format noted by the original author (no code visible here
    # builds it):
    # {'time': time, 'name': name, 'text': text, 'zan': zan, 'ping': ping,
    #  'zhuan': zhuan,
    #  'comment': [{'name': name, 'text': text}, {'name': name, 'text': text}]}

    def comment_url(self, url, num):
        """Return the URL of comment page ``num`` for the comment link ``url``.

        The ``#cmtfrm`` fragment is removed and ``&page=<num>`` is appended.
        """
        url = re.sub('#cmtfrm', '', url)
        return url + '&page=' + str(num)

    def get_commentsoup(self, url):
        """Download a comment page after a 2 s pause and return its BeautifulSoup tree."""
        return self._fetch_soup(url, 2)

    def get_comment(self, list, ping_url):  # noqa: A002 - parameter name kept for callers
        """Walk every comment page of a post and read each commenter's name.

        Args:
            list: Passed in by the caller; not used by the statements visible
                in this snippet.
            ping_url: Link to the post's comment page.
        """
        ping_url = self._force_https(ping_url)
        soup = self.get_soup(ping_url)
        # The hidden <input name="mp"> holds the page count; when it is
        # absent a single page is assumed.
        page_input = soup.find('input', attrs={"name": "mp"})
        if page_input is None:
            con_pagenum = 1
        else:
            con_pagenum = page_input.attrs["value"]
        for i in range(1, int(con_pagenum) + 1):
            url = self.comment_url(ping_url, i)
            soup = self.get_commentsoup(url)
            # Each comment sits in a <div> whose id starts with "C_".
            for div in soup.find_all("div", attrs={"id": re.compile("C_.*?")}):
                name = div.find("a")
                name = name.get_text()
                # NOTE(review): the published snippet is cut off after this
                # statement; whatever was done with `name` is not visible and
                # is deliberately not reconstructed here.
相关资源
- python实现SGBM图像匹配算法
- python实现灰度直方图均衡化
- scrapy_qunar_one
- Python学习全系列教程永久可用
- python简明教程.chm
- 抽奖大转盘python的图形化界面
- 双边滤波器实验报告及代码python
- python +MYSQL+HTML实现21蛋糕网上商城
- Python-直播答题助手自动检测出题搜索
- OpenCV入门教程+OpenCV官方教程中文版
- Python 串口工具源码+.exe文件
- Python开发的全栈股票系统.zip
- Python操作Excel表格并将其中部分数据写
- python书籍 PDF
- 利用python绘制散点图
- python+labview+No1.vi
- 老男孩python项目实战
- python源码制作whl文件.rar
- python3.5可用的scipy
- PYTHON3 经典50案例.pptx
- 计算机科学导论-python.pdf
- python模拟鼠标点击屏幕
- windows鼠标自动点击py脚本
- 鱼c小甲鱼零基础学python全套课后题和
- Python 练习题100道
- Practical Programming 2nd Edition
- wxPython Application Development Cookbook
- python 3.6
- Python 3.5.2 中文文档 互联网唯一CHM版本
- python3.5.2.chm官方文档
评论
共有 条评论