百度文库PDF爬虫

大小: 11.46MB

文件类型: .rar

金币: 1

下载: 0 次

发布日期: 2023-07-26
语言: Python
标签: 文库 pdf

高速下载

资源简介

一个基于Python的百度文库爬虫，主要实现文库pdf文件的下载。

资源截图

小图大图

代码片段和文件信息

import requests
import re
import argparse
import json
import os

# Command-line interface: two required positional arguments, the document URL
# and the document type.
parser = argparse.ArgumentParser()
parser.add_argument("url", help="Target Url,你所需要文档的URL", type=str)
parser.add_argument('type', help="Target Type,你所需要文档的的类型(DOC|PPT|TXT|PDF)", type=str)
args = parser.parse_args()

url = args.url
# NOTE(review): this shadows the builtin ``type``; the name is kept because
# code outside this view may read the module-level ``type``.
type = args.type

# The handler function is chosen according to the document type.
# ``y`` is shared state for DOC(): it holds the last "y" value seen for a text
# fragment, so DOC() can start a new line whenever that value changes.
y = 0


def DOC(url):
    """Download the text of a DOC-type document into ``<doc_id>.txt``.

    The document id is taken from the ``view/<doc_id>.html`` part of *url*.
    The page HTML is fetched and scanned for per-page JSON URLs; each of
    those is fetched in turn and its ``"c"`` (text) / ``"y"`` fragments are
    printed and appended to the output file. A newline is written before a
    fragment whenever its ``"y"`` value differs from the previous one
    (tracked in the module-level ``y``).

    Args:
        url: Address of the document page; must contain ``view/<id>.html``.

    Raises:
        IndexError: If *url* does not contain ``view/<id>.html``.
    """
    global y
    doc_id = re.findall('view/(.*).html', url)[0]
    # Bound before the loops so the "saved" message below can never hit an
    # unbound name when a page yields no text fragments.
    filename = doc_id + '.txt'
    html = requests.get(url).text
    lists = re.findall('(https.*?0.json.*?)\\\\x22}', html)
    # Only the first half of the matches is used.
    # NOTE(review): presumably every URL appears twice in the page - confirm.
    page_urls = lists[:len(lists) // 2]
    for page_url in page_urls:
        # The URLs are embedded with backslash escapes; strip them before use.
        txts = requests.get(page_url.replace('\\', '')).text
        # NOTE(review): the trailing comma terminates the lazy "y" group; it
        # was reconstructed from a garbled listing - confirm against upstream.
        txtlists = re.findall('"c":"(.*?)".*?"y":(.*?),', txts)
        # Open once per page rather than once per fragment.
        with open(filename, 'a', encoding='utf-8') as f:
            for content, y_pos in txtlists:
                # Turn \uXXXX escape sequences into real characters.
                text = content.encode('utf-8').decode('unicode_escape', 'ignore')
                print(text)
                if y != y_pos:
                    y = y_pos
                    n = '\n'
                else:
                    n = ''
                f.write(n + text.replace('\\', ''))
        print("文档保存在" + filename)


def PPT(url):
    """Download every slide image of a PPT-type document.

    The document id is taken from the ``view/<doc_id>.html`` part of *url*.
    The ``getbcsurl`` endpoint is queried for the slide image URLs, and each
    image is saved as ``img<N>.jpg`` inside a directory named after the
    document id (created if it does not exist yet).

    Args:
        url: Address of the document page; must contain ``view/<id>.html``.

    Raises:
        IndexError: If *url* does not contain ``view/<id>.html``.
        OSError: If the output directory or an image file cannot be created.
    """
    doc_id = re.findall('view/(.*).html', url)[0]
    url = "https://wenku.baidu.com/browse/getbcsurl?doc_id=" + doc_id + "&pn=1&rn=99999&type=ppt"
    html = requests.get(url).text
    # The image URLs come back with backslash escapes; strip them before use.
    image_urls = [
        escaped.replace("\\", '')
        for escaped in re.findall('{"zoom":"(.*?)","page"', html)
    ]
    # exist_ok tolerates a directory left over from an earlier run, while real
    # failures (e.g. permissions) are reported instead of silently swallowed.
    os.makedirs(doc_id, exist_ok=True)
    for i, image_url in enumerate(image_urls):
        img = requests.get(image_url).content
        # os.path.join keeps the path portable; a hard-coded backslash only
        # works as a separator on Windows.
        with open(os.path.join(doc_id, 'img' + str(i) + '.jpg'), 'wb') as m:
            m.write(img)
    print("PPT图片保存在" + doc_id + "文件夹")


def TXT(url):
    """Download the text of a TXT-type document into ``<doc_id>.txt``.

    The document id is taken from the ``view/<doc_id>.html`` part of *url*.
    The ``getdocinfo`` endpoint supplies the ``md5sum``, ``totalPageNum`` and
    ``rsign`` values needed to build the text-retrieval URL. The JSON returned
    by that URL is searched for ``'c'`` entries, which are printed and then
    appended to the output file.

    Args:
        url: Address of the document page; must contain ``view/<id>.html``.

    Raises:
        IndexError: If *url* does not contain ``view/<id>.html`` or the
            document-info response lacks one of the expected fields.
    """
    doc_id = re.findall('view/(.*).html', url)[0]
    url = "https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=" + doc_id
    html = requests.get(url).text
    md5 = re.findall('"md5sum":"(.*?)"', html)[0]
    pn = re.findall('"totalPageNum":"(.*?)"', html)[0]
    rsign = re.findall('"rsign":"(.*?)"', html)[0]
    # NOTE(review): md5 is appended right after '&type=txt' with no parameter
    # name; presumably the captured value carries its own prefix - confirm.
    NewUrl = 'https://wkretype.bdimg.com/retype/text/' + doc_id + '?rn=' + pn + '&type=txt' + md5 + '&rsign=' + rsign
    txt = requests.get(NewUrl).text
    jsons = json.loads(txt)
    # The text entries are pulled out of the repr of the parsed JSON, so line
    # breaks show up as the two-character sequences \r and \n.
    texts = re.findall("'c': '(.*?)'", str(jsons))
    print(texts)
    filename = doc_id + '.txt'
    with open(filename, 'a', encoding='utf-8') as f:
        for text in texts:
            # Convert the escaped sequences back into real line breaks.
            f.write(text.replace('\\r', '\r').replace('\\n', '\n'))
    print("文档保存在" + filename)


def FPD（url）:
    doc_id = re.findall（‘view/（.*）.html‘ url）[0]
    url = “https://wenku.baidu.com/browse/getbcsurl?doc_id=“ + doc_id + “&pn=1&rn=99999&type=ppt“
    html = requests.get（url）.text
    lists = re.findall（‘{“zoom“:“（.*?）“

属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----

     文件        398  2018-05-30 09:36  文库爬虫\.idea\FreeForWenku-master.iml

     文件        212  2018-05-30 09:35  文库爬虫\.idea\misc.xml

     文件        290  2018-05-30 09:35  文库爬虫\.idea\modules.xml

     文件      14917  2018-05-30 10:03  文库爬虫\.idea\workspace.xml

     文件     141668  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img0.jpg

     文件     177485  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img1.jpg

     文件     211042  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img10.jpg

     文件     201024  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img11.jpg

     文件     257453  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img12.jpg

     文件     213538  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img13.jpg

     文件     228293  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img14.jpg

     文件     236833  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img15.jpg

     文件     213298  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img16.jpg

     文件     269043  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img17.jpg

     文件     231817  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img18.jpg

     文件     268082  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img19.jpg

     文件     206299  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img2.jpg

     文件     248085  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img3.jpg

     文件     246805  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img4.jpg

     文件     226442  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img5.jpg

     文件     257286  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img6.jpg

     文件     197320  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img7.jpg

     文件     226475  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img8.jpg

     文件     267267  2018-05-30 09:50  文库爬虫\09f3e7c9a1c7aa00b52acb96\img9.jpg

     文件      44871  2018-05-30 09:55  文库爬虫\695832bbd5d8d15abe23482fb4daa58da0111cc7.txt

     文件    7826677  2018-04-09 12:14  文库爬虫\FreeForWenku.exe

     文件       3752  2018-05-30 09:59  文库爬虫\FreeForWenku.py

     文件        575  2018-05-30 09:59  文库爬虫\README.md

     目录          0  2018-05-30 09:36  文库爬虫\.idea\inspectionProfiles

     目录          0  2018-05-30 10:03  文库爬虫\.idea

............此处省略5个文件信息

上一篇：Python-数学建模竞赛中所使用的相关算法的MATLAB实现
下一篇：python3+Django微博源代码和开发环境

共有条评论

百度文库PDF爬虫

资源简介

资源截图

代码片段和文件信息

评论

相关资源