资源简介
微信公众号历史文章爬取 API
代码片段和文件信息
# -*- encoding: utf-8 -*-
# !/usr/bin/python3
# @Time : 2019/6/21 11:14
# @File : api.py
from json import loads
from urllib.request import Request
from urllib.request import urlopen
from ssl import _create_unverified_context
from urllib.parse import urlencode
from settings import USER_AGENT
def get_history_api(**kwargs):
“““
获取公众号历史文章的 api 接口
:param biz: 公众号的识别码
:param uin: 登陆的微信账号的识别码
:param key: 获取历史信息必要的 key
:param offset: 偏移量
:param count: 历史图文发布的次数,一次是多图文,最大值10,即获取偏移量后最近10次发布的所有图文消息
:return: 解析好的json格式字典
“““
def match_item_info(item_dict, article_publish_time):
    """
    Build a structured record for a single article.

    :param item_dict: dict holding the raw fields of one article
    :param article_publish_time: publish time to store in the record; passed through unchanged
    :return: dict with the keys article_title, article_author, article_publish_time,
             article_digest, article_content_url, article_cover_url,
             article_source_url and article_copy_right
    """
    def unescape_url(field):
        # NOTE(review): the URL fields presumably carry HTML-escaped ampersands
        # ("&amp;"); replacing "&" with "&" would be a no-op, so the entity is
        # spelled out here -- confirm against a real response.
        # `or ""` also covers a key that is present but set to None.
        return (item_dict.get(field) or "").replace("&amp;", "&")

    # copyright_stat == 11 is the only value reported as "original" (flag 1).
    copy_right = 1 if item_dict.get("copyright_stat", 0) == 11 else 0

    return {
        "article_title": item_dict.get("title", ""),  # article title
        "article_author": item_dict.get("author", ""),  # article author
        "article_publish_time": article_publish_time,  # publish time
        "article_digest": item_dict.get("digest", ""),  # article summary
        "article_content_url": unescape_url("content_url"),  # link to the article body
        "article_cover_url": unescape_url("cover"),  # cover image link
        "article_source_url": unescape_url("source_url"),  # link to the source text
        "article_copy_right": copy_right,  # original-content flag
    }
uri_api = “http://mp.weixin.qq.com/mp/profile_ext“
form_data = {
“action“: “getmsg“
“__biz“: kwargs[“biz“]
“offset“: kwargs[“offset“]
“count“: kwargs.get(“count“ 10)
“uin“: kwargs[“uin“]
“key“: kwargs[“key“]
“f“: “json“
}
request = Request(uri_api data=urlencode(form_data).encode() headers={
“User-Agent“: USER_AGENT
})
resp_json = loads(urlopen(request context=_create_unverified_context()).read().decode() encoding=“utf-8“)
article_infos = []
next_offset = h_offset = kwargs[“offset“]
ending = False
status = 200 if resp_json.get(“errmsg“ ““) == “ok“ else 500
if status == 200:
next_offset = resp_json.get(“next_offset“ -1)
if next_offset == h_offset:
ending = True
if next_offset == -1:
next_offset = h_offset
status = 500
general_msg_list = resp_json.get(“general_msg_list“ ““)
if general_msg_list and status == 200:
general_msg_list = loads(general_msg_list encoding=“utf-8“).get(‘list‘ [])
for general_msg in general_msg_list:
publish_time = general_msg[“comm_msg_info“].get(“datetime“ 0)
app_msg
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2019-07-15 07:51 weixin-platform-history-article-api-master\
文件 1208 2019-07-15 07:51 weixin-platform-history-article-api-master\.gitignore
文件 3731 2019-07-15 07:51 weixin-platform-history-article-api-master\README.md
文件 8328 2019-07-15 07:51 weixin-platform-history-article-api-master\api.py
文件 1110 2019-07-15 07:51 weixin-platform-history-article-api-master\fiddler抓取微信PC端的uin与公众号key.md
目录 0 2019-07-15 07:51 weixin-platform-history-article-api-master\images\
文件 281674 2019-07-15 07:51 weixin-platform-history-article-api-master\images\0190628170354.png
文件 85039 2019-07-15 07:51 weixin-platform-history-article-api-master\images\0628170608.png
文件 225307 2019-07-15 07:51 weixin-platform-history-article-api-master\images\28171406.png
文件 112236 2019-07-15 07:51 weixin-platform-history-article-api-master\images\90628170758.png
文件 647 2019-07-15 07:51 weixin-platform-history-article-api-master\settings.py
文件 1307 2019-07-15 07:51 weixin-platform-history-article-api-master\simple_example.py
评论
共有 条评论