Resource Description
Following the same idea as the C++ version, you can specify the number of result pages to crawl and extract information from Baidu search result pages.
Code Snippet and File Information
#!/usr/bin/env python
#coding:utf-8
import sys
import urllib
import urllib2
import re

class FetchUrl:
    """A Baidu crawler that extracts sub-URLs from search result pages."""

    def __init__(self, strKeyword, iPages=1):
        '''Basic initialization.'''
        self.m_strKeyword = strKeyword
        self.m_iPages = iPages

    def GetSubPageUrlList(self, url, comreg):
        '''Fetch one result page and extract its sub-URLs.'''
        try:
            response = urllib2.urlopen(url)
        except urllib2.HTTPError:
            print "******Got an HTTPError, trying again*****"
            response = urllib2.urlopen(url)
        except urllib2.URLError:
            print "******Got a URLError, trying again*****"
            response = urllib2.urlopen(url)
        htmlpage = response.read()
        infoList1 = re.findall(comreg, htmlpage)
        # Deduplicate the list before returning it
        return list(set(infoList1))

    def GetUrlList(self):
        '''Collect sub-links from the specified number of result pages.'''
        mainList = []
        reg = r'http://www.baidu.com/link\?url=.[^"]+'
        comreg = re.compile(reg)
        print "Task keyword: %s" % self.m_strKeyword
        # URL-encode the keyword (GBK console input re-encoded as UTF-8)
        encodeKeyword = urllib.quote(self.m_strKeyword.decode('gbk').encode('utf-8'))
        i = 1
        while i <= self.m_iPages:
            # pn is Baidu's result offset: 10 results per page
            url = 'http://www.baidu.com/s?wd=%s&pn=%d&tn=baiduhome_pg&ie=utf-8&usm=4' % (encodeKeyword, (i - 1) * 10)
            # Accumulate the deduplicated links from each page
            subList = self.GetSubPageUrlList(url, comreg)
            mainList.extend(subList)
            i += 1
        return mainList
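A minimal usage sketch, not part of the original file: the keyword and page count below are hypothetical example values, and the keyword must be a GBK-encoded byte string (plain ASCII also qualifies) because GetUrlList decodes it from GBK.

if __name__ == '__main__':
    # Hypothetical example: crawl the first 3 result pages for a keyword
    fetcher = FetchUrl('python crawler', 3)
    for link in fetcher.GetUrlList():
        print link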
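Since urllib2 exists only in Python 2, a rough Python 3 equivalent of the fetch-and-extract step, using the standard-library urllib.request and urllib.parse modules, might look like the sketch below. This port is an illustration under those assumptions, not part of the original download.

# Python 3 sketch of the same fetch-and-extract step (illustrative port)
import re
import urllib.parse
import urllib.request

def fetch_sub_urls(keyword, page=0):
    """Fetch one Baidu result page and return its deduplicated link URLs."""
    quoted = urllib.parse.quote(keyword)  # keyword is already a str in Python 3
    url = ('http://www.baidu.com/s?wd=%s&pn=%d&tn=baiduhome_pg&ie=utf-8&usm=4'
           % (quoted, page * 10))
    html = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
    links = re.findall(r'http://www\.baidu\.com/link\?url=[^"]+', html)
    return list(set(links))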