
# coding=utf-8
import sys

# Process-wide default string encoding this script expects.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    # Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
    # module has to be reloaded to get it back before the default can be
    # switched. On interpreters whose default is already UTF-8 this branch is
    # never entered.
    # NOTE(review): the body of this `if` was missing from the source; this is
    # the conventional idiom for the guard above -- confirm against the
    # original script.
    reload(sys)
    sys.setdefaultencoding(default_encoding)
import urllib2
import re
import random
import time
from bs4 import BeautifulSoup
import leancloud
from leancloud import object
from leancloud import LeanCloudError
# When a "next article" link is detected, its URL is kept here; once the walk
# through the previous articles has finished, crawling comes back to it.
WAIT_URL = None
# 1 means "next article".
SEARCH_TYPE = 1
class Get_First_Url:
    """Download a CSDN blog home page and find the URL of its first article.

    Constructing an instance performs the HTTP request immediately.

    Attributes set by ``__init__``:
        url: the blog home page address that was passed in.
        page: the response body decoded as UTF-8.
        beginurl: result of ``getFirstUrl()`` -- an absolute article URL, or
            the string "nourl" when no article link was found.
        type: 1 when the link came from ``span.link_title``, 2 when the
            ``h3.list_c_t`` fallback was tried (set by ``getFirstUrl``).
    """

    # Pool of User-Agent strings; one is picked at random for each request.
    # NOTE(review): the two "rv: Gecko/... Firefox/" entries look truncated
    # (version numbers missing); they are kept exactly as found in the source.
    USER_AGENTS = (
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv: Gecko/20071127 Firefox/',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv: Gecko/20070731 Ubuntu/dapper-security Firefox/',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
    )

    def __init__(self, url2):
        """Fetch ``url2`` and resolve the first article link on that page.

        Args:
            url2: address of the blog home page to download.

        Raises:
            Whatever ``urllib2.urlopen`` raises on a failed request, and
            ``UnicodeDecodeError`` if the body is not valid UTF-8.
        """
        self.url = url2
        print('博客主页地址: ' + self.url)

        req = urllib2.Request(self.url)
        req.add_header('User-Agent', random.choice(self.USER_AGENTS))
        req.add_header('Host', 'blog.csdn.net')
        req.add_header('Accept', '*/*')
        req.add_header('Referer', 'http://blog.csdn.net/mangoer_ys?viewmode=list')
        # The original passed a bare name `url` here, which is not defined in
        # this method; the page address held on the instance is used instead.
        req.add_header('GET', self.url)

        html = urllib2.urlopen(req)
        try:
            self.page = html.read().decode('utf-8')
        finally:
            # Release the connection even if reading or decoding fails.
            html.close()

        self.beginurl = self.getFirstUrl()

    def getFirstUrl(self):
        """Return the absolute URL of the first article on the home page.

        Looks for ``<span class="link_title">`` first and falls back to
        ``<h3 class="list_c_t">`` (a different blog theme). ``self.type`` is
        set to 1 or 2 to record which lookup was used.

        Returns:
            'http://blog.csdn.net' + the link's href, or the string "nourl"
            when neither element exists or the element carries no usable link.
        """
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        bs = BeautifulSoup(self.page, 'html.parser')

        html_content_list = bs.find('span', class_='link_title')
        self.type = 1
        if html_content_list is None:
            # Different blog theme: article titles live in <h3 class="list_c_t">.
            html_content_list = bs.find('h3', class_='list_c_t')
            self.type = 2
        if html_content_list is None:
            return "nourl"

        try:
            return 'http://blog.csdn.net' + html_content_list.a['href']
        except (TypeError, KeyError):
            # TypeError: the element has no <a> child (``.a`` is None).
            # KeyError: the <a> has no href attribute.
            return "nourl"
class CSDN_Blog_Spider:
def __init__(self url2 type):
self.url = url2
self.type = type
if type == 4:
global WAIT_URL
WAIT_URL = url2
print ‘已记录待爬下一篇地址‘ + url2
print(‘正在爬取网页地址: ‘ + self.url)
user_agents = [
‘Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv: Gecko/20071127 Firefox/‘
