• 大小: 7KB
    文件类型: .py
    金币: 2
    下载: 1 次
    发布日期: 2021-05-12
  • 语言: Python
  • 标签: python  微博  

资源简介

利用 Python 爬取微博内容,能够做到爬取任意关键字下的全部微博内容

资源截图

代码片段和文件信息

# -*- coding: utf-8 -*-
import re
import requests
import time
from bs4 import BeautifulSoup
import sys
import json
import datetime

# Python 2 hack: force UTF-8 as the default str encoding so Chinese text
# scraped from weibo.cn can be handled without explicit decode() calls.
# Guarded so the module still imports on Python 3, where reload() is not
# a builtin and UTF-8 is already the default encoding.
try:
    reload(sys)
    sys.setdefaultencoding('utf-8')
except NameError:
    pass  # Python 3: no reload builtin, nothing to do

class weibo():
    """Scraper for mobile Weibo (weibo.cn) keyword-search results.

    All configuration lives in the class-level ``header`` dict, which
    imitates a desktop Chrome browser and carries a logged-in session
    cookie — weibo.cn refuses search requests without a valid Cookie.
    """

    # NOTE(review): the Cookie below is a personal session token and will
    # expire; replace it with a fresh logged-in cookie before running.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cookie': '_T_WM=dbeb65f9c841bd67a4f32cab3ddbf7ec; ALF=1514628952; SCF=AuIwFvQ5M6uY3uNIsY0hghDyz7lZI2hJXLAmQfi-kZSxz7rrfhFd-xg1a49sZCflizdhw72KVhkyNCabj6L-AfQ.; SUB=_2A253G6oIDeRhGeBN41oR9ynFwjuIHXVU5zZArDV6PUNbktBeLWPykW1NRAPsHIqkTzNa_zyi5uR2WyUO7jV8VH6z; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWjY5CN2FR7gfTeS2hHSFmV5JpX5KMhUgL.Foq01hn7S0M41KM2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMRSh2pShM0Sh-p; SUHB=0z1wvP8FOQCXXR; SSOLoginState=1512036952',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
    }

    def __init__(self):  # fixed: original had the typo `_init_`, which never runs
        # No per-instance state; everything is class-level.
        pass
        
    def get_soup(selfurl):#返回soup
        time.sleep(1.5)
        req = requests.get(urlheaders=self.header)
        html=req.text
        soup=BeautifulSoup(html“lxml“)
        return soup
    
    
    
    def create_url(selfpagestarttime):#构造连续url
        #https://weibo.cn/search/mblog?hideSearchframe=&keyword=%E4%B8%AD%E5%8D%B0%E5%AF%B9%E5%B3%99&advancedfilter=1&starttime=20170630&endtime=20170701&sort=time&page=2
        url=“https://weibo.cn/search/mblog/?keyword=中印对峙&sort=time&advancedfilter=1&“
        x=datetime.timedelta(days=0)
        endtime=starttime+x
        endtime=endtime.strftime(‘%Y%m%d‘)
        starttime=starttime.strftime(‘%Y%m%d‘)
        url=url+‘starttime=‘+starttime+‘&endtime=‘+endtime+‘&sort=time&page=‘+str(page)
        return url
        
        #格式 {‘time‘:time‘name‘:name‘text‘:text‘zan‘:zan‘ping‘:ping‘zhuan‘:zhuan‘comment‘:[{‘name‘:name‘text‘:text}{‘name‘:name‘text‘:text}]}
    def comment_url(selfurlnum):#评论url
        url=re.sub(‘#cmtfrm‘‘‘url)
        url=url+‘&page=‘+str(num)
        return url
    def get_commentsoup(selfurl):#评论soup
        time.sleep(2)
        req = requests.get(urlheaders=self.header)
        html=req.text
        soup=BeautifulSoup(html“lxml“)
        return soup
    
    def get_comment(selflistping_url):#解析评论
        ping_url=re.sub(‘http‘ ‘https‘ ping_url)
        soup=self.get_soup(ping_url)
        con_pagenum=soup.find(‘input‘ attrs={“name“:“mp“})
        if con_pagenum==None:
            con_pagenum=1
        else:
            con_pagenum=con_pagenum.attrs[“value“]
        for i in range(1int(con_pagenum)+1):
            url=self.comment_url(ping_urli)
            soup=self.get_commentsoup(url)
            for div in soup.find_all(“div“attrs = {“id“:re.compile(“C_.*?“)}):
                name=div.find(“a“)
                name=name.get_text()

评论

共有 条评论