资源简介
本资源仅为教学源码,并非专业爬虫
所用到的库:requests、bs4、re、json、time、os
代码可运行,可适当根据自身情况调整爬取频率,即 time.sleep() 中的间隔时间
代码片段和文件信息
from bs4 import BeautifulSoup
import requests
import os
import re
import json
import time
# Date range as 'YYYYMMDD' strings; CreateDate slices them into year
# ([:4]), month ([4:6]) and day ([6:]).
start_date = '20190316'
end_date = '20190330'
# Number of days in each month, January through December.
days_list1 = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]  # common year
days_list2 = [31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]  # leap year
# Call counter incremented by UAPool to rotate the User-Agent header.
turn = 0
# NOTE(review): the use of this value is not visible in this part of the
# file -- confirm its meaning before changing it.
total_length = 187935
def UAPool():
    """Return request headers carrying the next User-Agent from a fixed pool.

    Every call increments the module-level counter ``turn`` and selects the
    pool entry at ``turn % len(agents)``, so consecutive calls walk through
    the pool in order and wrap around.

    Returns:
        dict: A single-entry mapping ``{'User-Agent': <agent string>}``.
    """
    global turn  # module-level counter used to switch the user agent
    # Each entry must be its own list element: without the separating commas
    # Python would concatenate the adjacent literals into one string.
    # Values are kept latin-1 encodable (ASCII hyphen, not an en dash)
    # because http.client encodes header values as latin-1.
    agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 QIHU 360SE',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; ServiceUI 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1 Firefox 4.0.1 - Windows',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11 Chrome 17.0 - MAC',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    ]
    turn += 1
    # Index by the actual pool size so adding or removing agents stays safe.
    return {'User-Agent': agents[turn % len(agents)]}
def CreateDate(start_date end_date): # 制作日期列表避免出现跨月份或者跨年份的日期使得url错误
print(‘CreateDate开始‘)
date_list = [] # 存放最终日期列表
days_list = []
year1 = int(start_date[:4])
month1 = int(start_date[4:6])
day1 = int(start_date[6:])
year2 = int(end_date[:4])
month2 = int(end_date[4:6])
day2 = int(end_date[6:])
for i in range(year1 year2 + 1):
if (i % 4 == 0 and i % 100 != 0) or i % 400 == 0: # 闰年
if year2 > year1 and i == year1:
days_list.append(days_list2[month1 - 1:])
elif year2 == year1:
days_list.append(days_list2[month1 - 1:month2])
elif year2 > year1 and i == year2:
days_list.append(days_list2[:month2])
else:
days_list.append(days_list2[::])
else:
if year2 > year1 and i == year1:
days_list.append(days_list1[month1 - 1:])
elif year2 == year1:
days_list.append(days_list1[month1 - 1:month2])
elif year2 > year1 and
- 上一篇:网易云课堂:21天搞定Python分布式爬虫
- 下一篇:树莓派LoRa教程.docx
评论
共有 条评论