Python爬取小说

大小: 0.03M

文件类型: .rar

金币: 1

下载: 0 次

发布日期: 2024-05-05
语言: Python
标签: python py 小说爬取

高速下载

资源简介

使用 Python 爬取网站上的小说，并保存为本地 txt 文件

资源截图

小图大图

代码片段和文件信息



#!/usr/bin/python

# -*- coding:utf-8 -*-

 

import requests #抓取网页的html源码

import random   #取随机数

from bs4 import BeautifulSoup #用于代替正则式 取源码中相应标签中的内容

import sys

import time #时间操作

 

 

class downloader(object):
    """Download chapters of a novel from biqukan.com into a local text file.

    ``names``, ``urls`` and ``nums`` start empty and are populated by
    ``get_download_catalogue``.
    """

    def __init__(self):
        self.server = 'http://www.biqukan.com'
        self.target = 'http://www.biqukan.com/0_790/'
        self.names = []  # chapter titles
        self.urls = []   # absolute chapter links
        self.nums = 0    # number of chapters selected for download

    def get_content(self, url, max_retries=None):
        """Fetch ``url`` and return the response body as text.

        Failed requests are reported on stdout and retried after a random
        pause of 8-14 seconds.

        Args:
            url: Address of the page to fetch.
            max_retries: Maximum number of retries after the first failed
                attempt. ``None`` (the default) retries indefinitely.

        Returns:
            The decoded body of the HTTP response (``Response.text``).

        Raises:
            requests.RequestException: Only when ``max_retries`` is set and
                every attempt failed.
        """
        # Browser-like headers; without them the site may refuse the request.
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-cn',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15',
        }

        # Request timeout in seconds, picked at random from [80, 180).
        timeout = random.randrange(80, 180)

        failures = 0
        while True:
            try:
                req = requests.get(url=url, headers=header, timeout=timeout)
                break
            except requests.RequestException as e:
                print('3', e)
                failures += 1
                if max_retries is not None and failures > max_retries:
                    raise
                # Back off for a random interval before the next attempt.
                time.sleep(random.randrange(8, 15))
        return req.text

    def get_download_catalogue(self, url):
        """Read the chapter index at ``url`` and record the chapters to fetch.

        Replaces ``self.names`` and ``self.urls`` and sets ``self.nums`` so
        the three always describe the same selection, even when called more
        than once.

        Args:
            url: Address of the novel's chapter index page.

        Raises:
            IndexError: If the page has no ``<div class="listmain">``.
        """
        html = self.get_content(url)
        bf = BeautifulSoup(html, 'html.parser')
        listing = bf.find_all('div', {'class': 'listmain'})[0]
        # Links 0-11 are the duplicated "latest chapters" list; take the
        # first five entries of the real chapter list that follows it.
        links = listing.find_all('a')[12:17]

        # get_text() always yields a string, whereas .string is None for an
        # anchor that contains nested markup.
        self.names = [link.get_text() for link in links]
        self.urls = [self.server + link.get('href') for link in links]
        self.nums = len(links)

    def get_download_content(self, url):
        """Return the body text of the chapter page at ``url``.

        Args:
            url: Address of a single chapter page.

        Returns:
            The chapter text, with each run of seven non-breaking spaces
            replaced by a blank line.

        Raises:
            IndexError: If the page has no ``<div class="showtxt" id="content">``.
        """
        html = self.get_content(url)
        bf = BeautifulSoup(html, 'html.parser')
        texts = bf.find_all('div', {'class': 'showtxt', 'id': 'content'})
        # '\xa0' is a non-breaking space; a run of seven is treated as a
        # paragraph separator here.
        text = texts[0].text.replace('\xa0' * 7, '\n\n')
        return text

    def writer(self, name, path, text):
        """Append one chapter to the file at ``path`` (UTF-8).

        Writes the title on its own line, then the text, then a blank line.

        Args:
            name: Chapter title.
            path: Destination file; created if it does not exist.
            text: Chapter body.
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.write(text)
            f.write('\n\n')

 

# Script entry point: download the selected chapters of the target novel
# and append them, in order, to a single text file.
if __name__ == '__main__':
    dl = downloader()
    dl.get_download_catalogue(dl.target)
    # Walk titles and links together instead of indexing both lists.
    for done, (name, url) in enumerate(zip(dl.names, dl.urls), start=1):
        dl.writer(name, '天尊.txt', dl.get_download_content(url))
        # Report progress as a percentage of the chapters selected.
        print("已下载：%.2f%%" % float(done / dl.nums * 100) + '\r')
    print('下载完成！')

属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----

    ..A..H.     17920  2020-11-17 11:57  PythonApplication1\.vs\PythonApplication1\v16\.suo

     文件       3191  2020-11-17 11:57  PythonApplication1\PythonApplication1.py

     文件       1568  2020-11-17 10:23  PythonApplication1\PythonApplication1.pyproj

     文件        977  2020-11-17 10:23  PythonApplication1\PythonApplication1.sln

     文件      73590  2020-11-17 11:28  PythonApplication1\天尊.txt

     目录          0  2021-01-25 09:58  PythonApplication1\.vs\PythonApplication1\v16

     目录          0  2021-01-25 09:58  PythonApplication1\.vs\PythonApplication1

    ...D.H.         0  2021-01-25 09:58  PythonApplication1\.vs

     目录          0  2021-01-25 09:58  PythonApplication1

----------- ---------  ---------- -----  ----

                97246                    9

上一篇：NumPy Cookbook
下一篇：动物图片识别.py（基于百度api）

共有条评论

Python爬取小说

资源简介

资源截图

代码片段和文件信息

评论

相关资源