python网络爬虫获取景点信息源码

大小: 6KB

文件类型: .py

金币: 1

下载: 0 次

发布日期: 2024-02-04
语言: Python
标签: python

高速下载

资源简介

python网络爬虫获取去哪儿网景点信息源码，获取的景点信息有'景点', '景点类别', '景点级别', '地点', '经度', '纬度', '开放时间', '景点介绍', '评论次数', '游客评分', '热度', '关键词', '图片路径'。内有详细注释。

资源截图

小图大图

代码片段和文件信息

# -!- coding: utf-8 -!-

from bs4 import BeautifulSoup
from urllib.parse import *
import urllib
import urllib.request
import re
import string
import time
import codecs

import csv

import jieba
import jieba.analyse
from optparse import OptionParser


# Compiled pattern for a run of one or more decimal digits.
# Group 1 is the repeated capture, so it holds only the last digit after the
# first one (or None for a single-digit match); use .group() for the full run.
hotnum = re.compile(r'\d(\d)*')

def getHotNum(cNum, grade):
    """Return a popularity ("heat") score for a scenic spot.

    Args:
        cNum: Number of comments; any value accepted by ``int()``.
        grade: Visitor rating; any value accepted by ``float()``.

    Returns:
        ``10 * grade`` plus a comment-count term. The term is
        ``cNum / 1000``, fixed at 50 once ``cNum`` reaches 50000.

    Raises:
        ValueError: If ``cNum`` or ``grade`` cannot be converted to a number.
    """
    if int(cNum) >= 50000:
        # The comment contribution is capped so huge counts do not dominate.
        hotNum = 50 + 10 * float(grade)
    else:
        hotNum = int(cNum) / 1000 + 10 * float(grade)
    return hotNum

# Initialise the HTTP request settings.
# url_base is prepended to the relative detail-page links found on listing
# pages (alternative base: 'http://piao.qunar.com/ticket/list.htm?').
url_base = 'http://piao.qunar.com'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:58.0)'
header = {'User-Agent': user_agent}

# NOTE(review): this looks like a third-party API key; it is not used in the
# visible part of the script. Confirm, and consider loading it from
# configuration instead of hard-coding it.
ak = 'whSDgmRhKopIDFMCGxj21FcY611b6R9h'

# Store the results in a CSV file. Append mode keeps rows from earlier runs;
# newline='' is what the csv module expects so it controls line endings.
csvfile = open('毕设测试数据.csv', 'a+', encoding='utf-8', newline='')
writer = csv.writer(csvfile)
# Header row (disabled; enable once when starting a fresh file):
#writer.writerow(['景点', '景点类别', '景点级别', '地点', '经度', '纬度', '开放时间', '景点介绍', '评论次数', '游客评分', '热度', '关键词', '图片路径'])

# Request the listing pages and visit every scenic-spot detail page on them.
# NOTE(review): with pageIndex starting at 38 the "pageIndex > 2" guard below
# breaks out on the first pass, so nothing is fetched. No increment of
# pageIndex is visible either; the published snippet is cut off before the end
# of the loop body, so it may live in the missing part. Confirm.
pageIndex = 38  # index of the listing page to request
while True:
    if pageIndex == 1:  # first page (default)
        # Alternative query: 'http://piao.qunar.com/ticket/list.htm?keyword=中国'
        url = 'http://piao.qunar.com/ticket/list.htm?keyword=%E7%83%AD%E9%97%A8%E6%99%AF%E7%82%B9&region=&from=mpl_search_suggest&subject=文化古迹&page=1'
        # Percent-encode the non-ASCII characters; every printable ASCII
        # character (including the existing %XX escapes) is left untouched.
        url = quote(url, safe=string.printable)
    elif pageIndex > 2:  # limit on the number of pages crawled
        break

    else:  # page number pageIndex
        print(pageIndex)

        # Alternative query: 'http://piao.qunar.com/ticket/list.htm?keyword=中国&page={0}'.format(pageIndex)
        url = 'http://piao.qunar.com/ticket/list.htm?keyword=%E7%83%AD%E9%97%A8%E6%99%AF%E7%82%B9&region=&from=mpl_search_suggest&subject=文化古迹&page={}'.format(pageIndex)
        url = quote(url, safe=string.printable)

    # Fetch the listing page with urllib.
    request = urllib.request.Request(url, headers = header)
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8', 'ignore')

    # Build the soup object for the listing page.
    soup = BeautifulSoup(html, 'html.parser')

    # Collect the detail-page links: re-parse the "result_list" containers
    # and pick the anchors with class "sight_item_do" inside them.
    a = soup.find_all('div', 'result_list')  # alternative class: search-list
    soup_news = BeautifulSoup(a.__str__(), 'html.parser')
    links = soup_news.find_all('a', 'sight_item_do')

    # Result fields collected so far and the running scenic-spot counter.
    results = []  # scenic-spot fields (name, location, opening hours, introduction, heat, ...)
    i = 0  # index of the current scenic spot

    # Walk through the detail-page links.
    for item in links:
        # Build the absolute detail-page URL. "links" is rebound to its own
        # tail on every pass, so links[0] is always the element the loop is
        # currently on.
        href = links[0]['href']
        href = url_base + href[0:]
        links = links[1:]

        # Request the scenic-spot detail page.
        request = urllib.request.Request(href, headers=header)
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')

        soup_content = BeautifulSoup(html, 'html.parser')  # detail-page content

        name = soup_content.find('span', 'mp-description-name').string
        results.append(name)  # scenic-spot name
        results.append('文化古迹')  # category, matching the "subject" query parameter
        # NOTE(review): the published snippet ends here; the rest of the loop
        # body is not available.

上一篇：用python的pyecharts模块绘制世界地图疫情配套资源
下一篇：图形识别与颜色识别工具

共有条评论

python网络爬虫获取景点信息源码

资源简介

资源截图

代码片段和文件信息

评论

相关资源