• 大小: 6KB
    文件类型: .py
    金币: 1
    下载: 0 次
    发布日期: 2021-05-13
  • 语言: Python
  • 标签: 爬虫  python  微博  

资源简介

此资料是用来爬取新浪微博评论的,修改最后的uid值即可

资源截图

代码片段和文件信息

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 18 13:11:58 2018

@author: qizhiliu
"""
import json
import math
import re  # regular expressions, used to clean irregular text
import time

import jieba
import requests
from lxml import html

# Output file, opened once at import time in append mode; the scraper's
# ``main`` method writes the segmented text to it line by line.
f = open(r'D:\test\test7.txt', 'a+', encoding='utf-8')

# Example uid: 2803301701

# Timestamp taken at start-up. ``time.clock`` was removed in Python 3.8,
# so fall back to ``time.perf_counter`` when it is unavailable; on older
# interpreters the original timer is still used.
start = getattr(time, 'clock', time.perf_counter)()
class weibo(object):
    
    def get_weibo(selfidpage_idpage):
        url=‘https://m.weibo.cn/api/container/getIndex?uid={}&type=uid&value={}&containerid={}&page={}‘.format(ididpage_idpage)
        response=requests.get(url)
        ob_json =json.loads(response.text)
        #print (response.text)
        #print (ob_json)
        list_cards=ob_json.get(‘data‘).get(‘cards‘)
        #list_text=ob_json.get(‘text‘)
        #print (list_text)
        #print(list_cards)
        return list_cards

    def get_comments(selfidpage):
        url=‘https://m.weibo.cn/api/comments/show?id={}&page={}‘.format(idpage)
        response=requests.get(url)
        ob_json =json.loads(response.text)
        if len(ob_json)<3:
            list_comments=‘‘
        else:
            list_comments=ob_json.get(‘data‘).get(‘data‘)
       # print (list_comments)
        
        return list_comments
    def main(selfidpagepage_id):
        list_cards  = self.get_weibo(idpage_idpage)
        #print (list_cards)
        for card in list_cards:
            if card.get(‘card_type‘)==9:  #等于9的微博才不是广告
                id = card.get(‘mblog‘).get(‘id‘)
                text= card.get(‘mblog‘).get(‘text‘)
                if text!=‘‘:
                    tree=html.fromstring(text)
                    text=tree.xpath(‘string(.)‘)                  
                    text=re.sub(r‘回复.*?:‘‘‘text)
                    text=re.sub(r‘ ‘‘ ‘text)
                    text=re.sub(r“@.* “‘‘text)
                    text = jieba.cut(text)
                    text=“ “.join(text)
                    #f.write(“***“)
                    #f.write(‘@@@微博‘)
                    f.write(text)
                    f.write(‘\n‘)
                else:
                    pass
                b=1
                #tree=html.fromstring(text)
                #text=tree.xpath(‘string(.)‘)
                while True:
                    list_comments=weibo.get_comments(idb)#获取博文对应的评论界面
                    b+=1
                    if b+1%10==0:
                        print(‘成功爬取100页评论‘)
                    if len(list_comments)<1:
                        break
                    else:
                        
                        count_hotcomments = 1
                        for comment in list_comments:
                 #           user_id = comment.get(‘user_id‘)
                  #          created_at = comment.get(‘created_at‘)
                            #link_counts = comment.get(‘like_counts‘)
                            text = comment.get(‘text‘)
               

评论

共有 条评论