资源简介
使用 Python + Selenium + PyQuery 编写的爬虫:爬取 1688(阿里巴巴)商品详情页的图片与标题,下载图片并进行压缩。仅供学习交流使用。
代码片段和文件信息
# -*- coding: utf-8 -*
import datetime
import os
import random
import re
import time
import Image
import requests
from selenium import webdriver
from pyquery import PyQuery as pq
from selenium.webdriver import ActionChains
from selenium.webdriver.common import keys
# Base folder for scraped goods. The path keeps a literal "%s" placeholder
# (presumably filled in per product by the caller -- confirm against usage).
parentPath = "/Users/niubilea/Documents/ag/ali_goods/%s/"
# Sub-folder for the raw downloaded images.
downloadPath = parentPath + "download"
# Sub-folder for the compressed copies of the images.
compressPath = parentPath + "compress"
def bluePrint(str):
    """Print *str* wrapped in the ANSI escape codes for bold blue text.

    Args:
        str: The text to print. The parameter name shadows the builtin but is
            kept so existing keyword callers keep working.
    """
    print('\033[1;34m' + str + '\033[0m')
def redPrint(str):
    """Print *str* in bold red on a black background using ANSI escape codes.

    The output starts with a single space before the colour code, as in the
    original implementation.

    Args:
        str: The text to print. The parameter name shadows the builtin but is
            kept so existing keyword callers keep working.
    """
    print(' \033[1;31;40m' + str + '\033[0m')
def openUrl(url):
    """Open *url* in Chrome, scroll the page down in steps, and return it.

    The page is scrolled 130 times with a random sub-second pause between
    steps (presumably so lazily loaded content gets rendered -- confirm),
    followed by a fixed 3 second wait before the HTML is read.

    Args:
        url: Address of the page to load.

    Returns:
        A ``(browser, page_source)`` tuple: the still-open Chrome driver and
        the HTML of the page after scrolling. The browser is not closed here,
        so the caller is responsible for quitting it.
    """
    browser = webdriver.Chrome("./chromedriver_mac_64")
    browser.get(url)
    top = 1000
    distance = 100
    for i in range(130):
        print(i)
        # The increment grows with i, so every step scrolls further than the
        # previous one.
        top = top + i * distance
        js = "var q=document.documentElement.scrollTop=" + str(top)
        browser.execute_script(js)
        time.sleep(random.random())
    time.sleep(3)
    return browser, browser.page_source
def getPageHtml(pageUrl):
    """Fetch *pageUrl* with ``requests`` and return its body decoded as GBK.

    Progress messages are printed before and after the request.

    Args:
        pageUrl: Address of the page to download.

    Returns:
        The response text, decoded with the ``gbk`` encoding.
    """
    print("开始获取html内容")
    headers = {
        'Content-type': 'text/html',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100201 Firefox/22.0',
    }
    content = requests.get(pageUrl, timeout=30, headers=headers)
    # Force GBK decoding instead of whatever requests would guess.
    content.encoding = 'gbk'
    htmlsub = content.text
    print("获取内容完成")
    return htmlsub
def create__file(file_path, msg):
    """Create (or overwrite) the file at *file_path* and write *msg* into it.

    Args:
        file_path: Path of the file to create.
        msg: Text content to write.
    """
    # The context manager guarantees the handle is closed, even if the write
    # fails; the original referenced f.close without calling it.
    with open(file_path, "w") as f:
        f.write(msg)
def download_banner(title, content, downloadFolder, compressFolder):
    """Download the banner images referenced in *content*, then compress them.

    Every ``src="https://cbu01...jpg"`` URL found in *content* is saved as
    ``<downloadFolder>/<title><n>.jpg`` where ``n`` counts from 1. A failed
    download is printed and skipped. After all downloads,
    ``tinypng(downloadFolder, compressFolder)`` is called once.

    Args:
        title: Prefix used for the saved file names.
        content: HTML text to search for image URLs.
        downloadFolder: Folder that receives the downloaded images.
        compressFolder: Folder passed to ``tinypng`` as the compression target.
    """
    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    # [^"]* keeps each match inside a single src attribute, and the closing
    # quote sits outside the group so it does not end up in the URL.
    pic_urls = re.findall(r'src="(https://cbu01[^"]*jpg)"', content)
    for index, key in enumerate(pic_urls, 1):
        # Short pause between consecutive downloads.
        time.sleep(0.3)
        targetImgPath = downloadFolder + '/%s.jpg' % (title + str(index))
        print(key + "\r\n")
        try:
            # The pattern only matches absolute https URLs, so no scheme
            # needs to be added here.
            urlretrieve(key, targetImgPath)
        except Exception as e:
            # Best effort: report the failure and continue with the next image.
            print(e)
    tinypng(downloadFolder, compressFolder)
def download_content(title, content, downloadFolder, compressFolder):
    """Download every ``<img src="...">`` image in *content*, then compress them.

    Each URL is saved as ``<downloadFolder>/<title><n>.jpg`` where ``n``
    counts from 1. URLs without an explicit ``http://`` or ``https://``
    scheme get ``https:`` prepended. A failed download is printed and skipped.
    After all downloads, ``tinypng(downloadFolder, compressFolder)`` is
    called once.

    Args:
        title: Prefix used for the saved file names.
        content: HTML text to search for image URLs.
        downloadFolder: Folder that receives the downloaded images.
        compressFolder: Folder passed to ``tinypng`` as the compression target.
    """
    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    pic_urls = re.findall('img src="(.*?)"', content)
    for index, key in enumerate(pic_urls, 1):
        # Short pause between consecutive downloads.
        time.sleep(0.03)
        targetImgPath = downloadFolder + '/%s.jpg' % (title + str(index))
        print(key + "\r\n")
        # Check the prefix rather than searching the whole string, so an
        # "http://" URL is left alone and "https" appearing mid-URL is not
        # mistaken for a scheme.
        if key.startswith(("http://", "https://")):
            url = key
        else:
            url = "https:" + key
        try:
            urlretrieve(url, targetImgPath)
        except Exception as e:
            # Best effort: report the failure and continue with the next image.
            print(e)
    tinypng(downloadFolder, compressFolder)
def tinypng(downloadcompress):
# 指定要压缩的文件夹
srcPath =download
# 压缩后文件夹
dstPath = compress
for filename in os.listdir(srcPath):
# 如果不存在目的目录则创建一个,保持层级结构
if not os.path.exists(dstPath):
os.makedirs(dstPath)
# 拼接完整的文件或文
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 15275600 2019-07-15 19:49 chromedriver_mac_64
文件 8543232 2019-07-15 20:29 chromedriver.exe
文件 11061936 2019-07-16 10:09 chromedriver_linux64
文件 5546 2019-08-17 23:12 seleniumDemo_back.py
相关资源
- Python编程快速上手——让繁琐工作自
- Python高性能编程高清版本
- Packt.Python.Artificial.Intelligence.Projects.
- 笨办法学Python 3 (Learn Python 3 the har
- Python高效开发实战Django+Tornado+Flask+T
- 深度学习入门:基于Python的理论与实
- python学习手册 第5版 pdf
- 深度学习入门:基于Python的理论与实
- Pandas 0.19.2 官方文档 汉化中文版(精
- python276_bin.zip
- 如何使用python做数据的预测和分析的
- 《Python进阶》中文版 pdf
- python计算机视觉编程完整版.rar
- Python深度学习中文版基于tensorflow实现
- python数据挖掘入门与实践.zip
- python2.6.4.msi WIN32安装包
- [书签+文字版]Python Machine Learning 2nd
- 《Python编程快速上手》英文原版
- Python游戏编程快速上手.rar
- 脉冲神经网络Python可运行
- matlab和python的神经网络
- wxPython3.0-win32-3.0.2.0-py2732位系统安装包
- Python核心编程第3版PDF高清晰完整中文
- python dlib 训练人脸特征点检测器
- Python数据处理
- Python从入门到实践--完整版本(高清)
- python基础知识笔记总结
- Francois Chollet-Deep Learning with Python2528
- python实现CNN中文文本分类
- Deep learning with Python Francois Chollet
评论
共有 条评论