python检索新浪微博.zip

大小: 0.26M

文件类型: .zip

金币: 2

下载: 1 次

发布日期: 2021-06-06
语言: Python
标签: 其他

高速下载

资源简介

python检索新浪微博.zip

资源截图

小图大图

代码片段和文件信息

# coding: utf-8

import urllib.request
import time
import random
from lxml import etree
import logging
import xlrd
from xlutils.copy import copy

# 导入所需模块
import urllib.error
import urllib.request
import urllib.parse
import re
import rsa
import http.cookiejar  # 从前的cookielib
import base64
import json
import urllib
import binascii

class CollectData（）:
    “““数据收集类
       利用微博高级搜索功能，按关键字搜集一定时间范围内的微博。
    “““

    def __init__（self keyword area startTime interval=‘50‘fileS=“weibo.csv“flag=True begin_url_per=“http://s.weibo.com/weibo/“）:
        self.begin_url_per = begin_url_per  # 设置固定地址部分
        self.setKeyword（keyword）  # 设置关键字
        self.setArea（area）  # 设置关键字
        self.setStartTimescope（startTime）  # 设置搜索的开始时间
        # self.setRegion（region）  #设置搜索区域
        self.setInterval（interval）  # 设置邻近网页请求之间的基础时间间隔（注意：过于频繁会被认为是机器人）
        self.setFileS（fileS）  # 设置邻近网页请求之间的基础时间间隔（注意：过于频繁会被认为是机器人）
        self.setFlag（flag）

    #设置关键字
    #关键字需解码后编码为utf-8
    def setKeyword（self keyword）:
        self.keyword = keyword.encode（“utf-8“）

    def setArea（self area）:
        self.area = area

    def setFileS（self fileS）:
        self.fileS = fileS

    def getKeyWord（self）:
        return urllib.parse.quote（self.keyword）

    def getArea（self）:
        return self.area

        ##设置起始范围，间隔为1天

    #格式为：yyyy-mm-dd
    def setStartTimescope（self startTime）:
        if not （startTime == ‘-‘）:
            self.timescope = startTime
        else:
            self.timescope = ‘-‘

    ##设置邻近网页请求之间的基础时间间隔
    def setInterval（self interval）:
        self.interval = int（interval）

        def setInterval（self interval）:
            self.interval = int（interval）

    #设置是否被认为机器人的标志。
    def setFlag（self flag）:
        self.flag = flag

    #构建URL
    def getURL（self）:
        return self.begin_url_per + “?q=“ + self.getKeyWord（） + “®ion=custom:“ + self.getArea（） + “&scope=ori&suball=1×cope=custom:“ + self.timescope + “&Refer=g&page=“

        ##爬取一次请求中的所有网页，最多返回50页

    def download（self url maxTryNum=4）:
        hasMore = True  # 某次请求可能少于50页，设置标记，判断是否还有下一页
        isCaught = False  # 某次请求被认为是机器人，设置标记，判断是否被抓住。抓住后，需要，进入页面，输入验证码

        i = 1  # 记录本次请求所返回的页数
        while hasMore and i < 99 and （not isCaught）:  # 最多返回98页，对每页进行解析，并写入结果文件
            source_url = url + str（i）  # 构建某页的URL
            data = ‘‘  # 存储该页的网页数据
            goon = True  # 网络中断标记
            ##网络不好的情况，试着尝试请求三次
            for tryNum in range（maxTryNum）:
                try:
                    html = urllib.request.urlopen（source_url timeout=12）
                    data = html.read（）.decode（）
                    break
                except:
                    if tryNum < （maxTryNum - 1）:
                        time.sleep（10）
                    else:
                        print（‘Internet Connect Error!‘）
                        self.flag = False

属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2019-03-16 13:06  python检索新浪微博\
     文件      267829  2019-03-15 13:26  python检索新浪微博\readme.docx
     文件       59392  2019-03-15 13:15  python检索新浪微博\weiboData.xls
     文件        8287  2019-03-16 13:02  python检索新浪微博\xinLang.py

上一篇：魔塔
下一篇：keras .whl文件用于python3

共有条评论

python检索新浪微博.zip

资源简介

资源截图

代码片段和文件信息

评论

相关资源