资源简介
python 爬虫爬取站长之家的模板,需要的看一看,毕业了,需要模板
代码片段和文件信息
# -*- coding: UTF-8 -*-
import requests
import re
class Resume(object):
    """Crawler that downloads free resume templates from sc.chinaz.com.

    Workflow: fetch listing page(s) -> collect detail-page URLs ->
    scrape each detail page for a download link and title -> download
    each template archive into ``test/<name>.rar``.
    """

    def __init__(self):
        # Desktop Chrome User-Agent so the site serves normal HTML.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
        }
        self.next_url_list = []  # detail-page URLs collected from listing pages
        self.Download_list = []  # [download_url, template_name] pairs

    def get_page(self, url):
        """Fetch one listing page and collect detail-page URLs.

        :param url: listing-page URL (e.g. http://sc.chinaz.com/jianli/free.html)
        """
        response = requests.get(url=url, headers=self.headers)
        if response.status_code == 200:
            print("请求成功")
            html = response.text
            # NOTE(review): the original pattern was garbled in the source
            # ('\s'); this anchor pattern is reconstructed for the chinaz
            # listing markup -- confirm against the live page HTML.
            next_urls = re.findall(r'<a target="_blank" href="(.*?)"', html)
            for next_url in next_urls:
                self.next_url_list.append(next_url)

    def parse_page(self, next_url):
        """Fetch one detail page; record its download URL and title.

        :param next_url: detail-page URL collected by get_page().
        """
        response = requests.get(url=next_url, headers=self.headers)
        response.encoding = "UTF-8"  # decode the Chinese page text correctly
        if response.status_code == 200:
            print("请求成功")
            html = response.text
            # NOTE(review): both patterns were garbled in the source; they are
            # reconstructed from the visible anchor text ("福建电信下载" is the
            # mirror link label) -- verify against the live detail-page HTML.
            Download_url = re.findall('<a href="(.*?)">福建电信下载</a>', html)
            name = re.findall('title": "(.*?)"', html)
            print('----------------------------------------------------')
            print(Download_url)
            print(name)
            print('----------------------------------------------------')
            # Guard against pages where either pattern found nothing; the
            # unguarded original would raise IndexError on such pages.
            if Download_url and name:
                self.Download_list.append([Download_url[0], name[0]])
                print(Download_url, name)

    def Download(self, download_url, name):
        """Download one template archive to test/<name>.rar.

        :param download_url: direct link to the .rar archive.
        :param name: template title used as the local file name.
        """
        response = requests.get(url=download_url, headers=self.headers)
        if response.status_code == 200:
            # Assumes the "test" directory already exists.
            with open("test/%s.rar" % name, "wb") as f:
                f.write(response.content)

    def main(self):
        """Drive the crawl: listing pages -> detail pages -> downloads."""
        # range(2, 3) crawls only listing page 2; widen the range to crawl
        # more pages (page 1 uses the plain free.html URL, later pages are
        # free_<n>.html).
        for page in range(2, 3):
            if page == 1:
                url = "http://sc.chinaz.com/jianli/free.html"
            else:
                url = "http://sc.chinaz.com/jianli/free_%s.html" % page
            print(url)
            self.get_page(url)
        for next_url in self.next_url_list:
            self.parse_page(next_url)
        for Download_info in self.Download_list:
            self.Download(Download_info[0], Download_info[1])
# Script entry point: build the crawler and run the full pipeline.
# (Fixed: the scraped source used curly quotes, which are a SyntaxError.)
if __name__ == "__main__":
    resume = Resume()
    resume.main()
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 240993 2016-01-13 14:44 jianli8782\1.jpg
----------- --------- ---------- ----- ----
240993 1
- 上一篇:基于python的数据分析论文集
- 下一篇:电影推荐系统
评论
共有 条评论