import copy
import csv
import re
from urllib import request
from urllib.parse import quote, urlencode

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Request headers for JD.com's search AJAX endpoint (s_new.php), reconstructed
# from a whitespace-mangled paste: curly quotes replaced by real quotes, the
# commas between dict entries restored, and the spaces that had been injected
# inside URL-encoded values / the cookie removed on a best-effort basis.
# NOTE(review): the cookie and log_id are session-specific and long expired;
# the commas between SKU ids in `show_items` were lost in the mangling and
# could not be recovered -- refresh both from a live browser session before use.
_JD_HEADERS = {
    'authority': 'search.jd.com',
    'method': 'GET',
    'path': ('/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1'
             '&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=4'
             '&s=84&scrolling=y&log_id=1529828108.22071&tpl=3_M'
             '&show_items=76519277367120705686874192526001239593418245549693'
             '893501742146265774952648054355373457574483120617607769327957336'
             '429596306652833872572246889274256224768461'),
    'scheme': 'https',
    'referer': ('https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA'
                '&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA'
                '&cid2=653&cid3=655&page=3&s=58&click=0'),
    'user-agent': ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/66.0.3359.139 Safari/537.36'),
    'x-requested-with': 'xmlHttpRequest',
    'Cookie': ('qrsc=3; pinId=RAGa4xMoVrs; xtest=1210.cf6b6759; '
               'ipLocation=%u5E7F%u4E1C; _jrda=5; '
               'TrackID=1aUdbc9HHS2MdEzabuYEyED1iDJaLWwBAfGBfyIHJZCLWKfWaB'
               '_KHKIMX9Vj9_2wUakxuSLAO9AFtB2U0SsAD-mXIh5rIfuDiSHSNhZcsJvg; '
               'shshshfpa=17943c91-d534-104f-a035-6e1719740bb6-1525571955; '
               'shshshfpb=2f200f7c5265e4af999b95b20d90e6618559f7251020a80ea'
               '1aee61500; cn=0; '
               '3AB9D23F7A4B3C9B=QFOFIDQSIC7TZDQ7U4RPNYNFQN7S26SFCQQGTC3YU5'
               'UZQJZUBNPEXMX7O3R7SIRBTTJ72AXC4S3IJ46ESBLTNHD37U; '
               'ipLoc-djd=19-1607-3638-3638.608841570; __jdu=930036140; '
               'user-key=31a7628c-a9b2-44b0-8147-f10a9e597d6f; areaId=19; '
               '__jdv=122270672|direct|-|none|-|1529893590075; PCSYCityID=25; '
               'mt_xid=V2_52007VwsQU1xaVVoaSClUA2YLEAdbWk5YSk9MQAA0BBZOVQ0A'
               'DwNLGlUAZwQXVQpaAlkvShhcDHsCFU5eXENaGkIZWg5nAyJQbVhiWR9BGlUN'
               'ZwoWYl1dVF0%3D; __jdc=122270672; '
               'shshshfp=72ec41b59960ea9a26956307465948f6; rkv=V0700; '
               '__jda=122270672.930036140.-.1529979524.1529984840.85; '
               '__jdb=122270672.1.930036140|85.1529984840; '
               'shshshsID=f797fbad20f4e576e9c30d1c381ecbb1_1_1529984840145'),
}


def getDataByUrl2(url):
    """Fetch *url* with the JD search headers and return a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Absolute URL of the page to fetch.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed (lxml) document tree of the response body.

    Side effects
    ------------
    Writes the raw decoded page source to ``data.txt`` (UTF-8) for debugging.
    Undecodable bytes in the response are silently dropped (``errors='ignore'``).
    """
    req = request.Request(url=url, headers=_JD_HEADERS)
    pageSource = request.urlopen(req).read().decode('utf-8', errors='ignore')
    # Explicit encoding: the page contains CJK text, and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open('data.txt', 'w', encoding='utf-8') as f:
        f.write(pageSource)
    return BeautifulSoup(pageSource, 'lxml')


def getDataByUrl(url):
    """Fetch *url* with the JD search headers and return a BeautifulSoup tree.

    NOTE(review): the original source for this function was truncated in the
    pasted text right after its (identical) headers dict; the body below
    mirrors :func:`getDataByUrl2` minus the ``data.txt`` debug dump. Confirm
    against the original file before relying on any behavioural difference.
    """
    req = request.Request(url=url, headers=_JD_HEADERS)
    pageSource = request.urlopen(req).read().decode('utf-8', errors='ignore')
    return BeautifulSoup(pageSource, 'lxml')
评论
共有 0 条评论