# -*- coding:utf-8 -*-
from lxml import html
import requests
import json
import os
import time
import re
from time import sleep
### 爬取微博内容
class CrawlWeibo:
    """Crawl a Weibo user's posts through the m.weibo.cn mobile JSON endpoints.

    NOTE(review): every request passes the module-level ``headers`` mapping,
    which is not defined in the visible part of this file -- confirm it is
    defined at module level before any method here is called.
    """

    def _fetch_json(self, url):
        """GET ``url`` with the module-level ``headers`` and decode the JSON body."""
        response = requests.get(url, headers=headers)
        return json.loads(response.text)  # a dict for the endpoints used below

    def getCards(self, id, page):
        """Collect the ``cards`` list of each timeline page of one user.

        Args:
            id: User id (uid) of the account, as a string.
            page: Number of timeline pages to request (int). Pages are
                requested in order from 1 to ``page``.

        Returns:
            A list with one entry per fetched page; each entry is that page's
            list of card dicts. The result is empty when ``page`` is not
            positive or when the very first page contains no cards.
        """
        list_cards = []
        for page_no in range(1, page + 1):
            url = ('https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id
                   + '&containerid=107603' + id + '&page=' + str(page_no))
            page_cards = self._fetch_json(url)['data']['cards']
            if not page_cards and page_no == 1:
                # Nothing on the first page: give up so the caller sees an
                # empty result instead of a list of empty pages.
                break
            list_cards.append(page_cards)
        return list_cards

    def getComments(self, id, page):
        """Return the comments of one post, preferring the hot comments.

        Args:
            id: Id of the post, as a string.
            page: Comment page number (int).

        Returns:
            ``data['hot_data']`` when the response carries it, otherwise
            ``data['data']``; an empty list when the response has no ``data``
            object or neither key is present.
        """
        url = 'https://m.weibo.cn/api/comments/show?id=' + id + '&page=' + str(page)
        ob_json = self._fetch_json(url)
        if 'data' not in ob_json:
            return []
        data = ob_json['data']
        if 'hot_data' in data:
            return data['hot_data']
        return data.get('data', [])

    def getAll(self, id, page, path):
        """Save the plain text of a user's posts to ``path + '<id>.txt'``.

        Args:
            id: User id (uid) of the account, as a string.
            page: Number of timeline pages to crawl (int).
            path: Prefix of the output file; it is concatenated as-is with
                ``'<id>.txt'``, so a directory must end with a separator.

        Returns:
            None. When no cards were fetched, no file is created.
        """
        list_cards = self.getCards(id, page)
        if not list_cards:
            return
        # The context manager guarantees the file is closed even when a
        # request or a parse step raises part-way through the crawl.
        with open(path + '%s.txt' % id, 'w', encoding='utf-8') as ff:
            for cards in list_cards:
                for card in cards:
                    # Only cards with card_type 9 are posts. The original
                    # author left a disabled variant that additionally
                    # required 'raw_text' to be absent from card['mblog'] in
                    # order to keep original (non-repost) posts only.
                    if card['card_type'] != 9:
                        continue
                    mblog = card['mblog']
                    if mblog['isLongText']:
                        # The timeline only carries a truncated text for long
                        # posts; the full body comes from a second endpoint.
                        url = 'https://m.weibo.cn/statuses/extend?id=' + mblog['id']
                        text = self._fetch_json(url)['data']['longTextContent']
                    else:
                        text = mblog['text']
                    # The text is an HTML fragment; XPath string(.) keeps only
                    # the text content and drops the markup.
                    tree = html.fromstring(text)
                    ff.write(tree.xpath('string(.)') + '\n')
# Archive contents (file listing pasted from the download page; not part of the program):
#   Type       Size   Date        Time   Name
#   ---------  -----  ----------  -----  ----
#   File       5392   2019-01-29  15:31  weibo_crawl\CrawlAndDeal.py
#   File        942   2018-12-22  20:08  weibo_crawl\Deal.py
#   File       5665   2019-01-29  15:30  weibo_crawl\WeiboCrawl.py
#   Directory     0   2019-01-29  15:33  weibo_crawl\weibo\
#   Directory     0   2019-01-29  15:33  weibo_crawl\
