• Size: 6KB
    File type: .py
    Coins: 2
    Downloads: 2
    Posted: 2021-09-04
  • Language: Python
  • Tags: Python web crawler

Resource Description

Scrapes the Douban reviews of the TV series The Rise of Phoenixes (天盛长歌), removes the stopwords from them, and generates a word cloud.
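
The posted snippet below is cut off before the stopword-filtering and word-cloud step, so here is a minimal sketch of that stage for orientation only (the build_wordcloud name, the stopword file path, mask image, and font path are assumptions, not part of the uploaded file):

import jieba
import numpy as np
from PIL import Image
from wordcloud import WordCloud

def build_wordcloud(comments, stopword_file="stopwords.txt",
                    mask_file="mask.png", font_file="simhei.ttf"):
    # load the stopword list (one word per line; the path is an assumption)
    with open(stopword_file, encoding="utf-8") as f:
        stopwords = {line.strip() for line in f if line.strip()}
    # cut the review text with jieba and drop stopwords and single characters
    words = [w for w in jieba.cut("".join(comments))
             if w.strip() and w not in stopwords and len(w) > 1]
    # draw the cloud inside the mask image, using a Chinese-capable font
    mask = np.array(Image.open(mask_file))
    wc = WordCloud(font_path=font_file, mask=mask,
                   background_color="white", max_words=200)
    wc.generate(" ".join(words))
    wc.to_file("wordcloud.png")

Dropping single-character tokens is a common choice for Chinese word clouds; adjust the filter to taste.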


Code Snippet and File Information

#coding=utf-8
import requests
from lxml import etree
import random
import pymysql
import jieba.analyse
import re
# from scipy.misc import imread
# from wordcloud import WordCloud
# from wordcloud import ImageColorGenerator
# import matplotlib.pyplot as plt
# from os import path
from PIL import Image, ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator


def geturl(url, IP_pools):
    USER_AGENTS = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    ]
    # pick a random User-Agent for each request
    Agent_Value = random.choice(USER_AGENTS)
    headers = {
        "User-Agent": Agent_Value,
        "Host": "movie.douban.com",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    }

    try:
        # first attempt with a random proxy from the pool
        ip_one = random.choice(IP_pools)
        print(ip_one)
        proxies1 = {'http': "http://" + ip_one}
        print(url)
        r = requests.get(url=url, headers=headers, proxies=proxies1, timeout=5)
        print(r.status_code)
        assert r.status_code == 200
        return etree.HTML(r.content)
    except:
        try:
            # second attempt with another random proxy
            ip_one = random.choice(IP_pools)
            print(ip_one)
            proxies1 = {'http': "http://" + ip_one}
            print(url)
            r = requests.get(url=url, headers=headers, proxies=proxies1, timeout=5)
            print(r.status_code)
            assert r.status_code == 200
            return etree.HTML(r.content)
        except:
            try:
                # third and final attempt
                ip_one = random.choice(IP_pools)
                print(ip_one)
                proxies1 = {'http': "http://" + ip_one}
                print(url)
                r = requests.get(url=url, headers=headers, proxies=proxies1, timeout=5)
                print(r.status_code)
                assert r.status_code == 200
                return etree.HTML(r.content)
            except:
                print("**" * 20 + "出现错误!" + "**" * 20)
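
# Note: the three nested try/except blocks above amount to a fixed number of
# retries. A loop-based sketch with the same behavior (the function name and
# the extra headers/retries parameters are not part of the original file):
def geturl_with_retries(url, IP_pools, headers, retries=3):
    # try up to `retries` random proxies before giving up
    for _ in range(retries):
        try:
            ip_one = random.choice(IP_pools)
            proxies1 = {'http': "http://" + ip_one}
            r = requests.get(url=url, headers=headers, proxies=proxies1, timeout=5)
            assert r.status_code == 200
            return etree.HTML(r.content)
        except Exception:
            continue
    print("**" * 20 + "出现错误!" + "**" * 20)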

def get_IP():
    # read usable proxies (score = 'T') from the MySQL ip_pool table
    con = pymysql.connect(host='192.168.0.136', user='root', passwd='oysm=K8cV6eldcv', db='lh', port=3306,
                          charset='utf8')
    if con:
        print("ok")
        cur = con.cursor()
        if cur:
            sql_read = "select IP, port from ip_pool where score = %s"
            cur.execute(sql_read, "T")
            con.commit()
            lines = cur.fetchall()
            a_list = []
            for i in lines:
                li = i[0] + ":" + i[1]
                # print(li)
                a_list.append(li)
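
The uploaded snippet is cut off at this point; the code that actually walks the review pages is not included. As a hedged illustration only, the pieces above might be wired together roughly as follows (the crawl_reviews name, the Douban comments URL pattern, and the XPath for the review text are assumptions, not taken from the original file):

def crawl_reviews(movie_id, pages=10, IP_pools=None):
    # movie_id is the Douban subject id of the show (caller-supplied placeholder);
    # get_IP() presumably returns the a_list of "ip:port" strings (its return
    # statement is cut off in the snippet above).
    if IP_pools is None:
        IP_pools = get_IP()
    comments = []
    for page in range(pages):
        url = ("https://movie.douban.com/subject/%s/comments?start=%d&limit=20"
               % (movie_id, page * 20))
        html = geturl(url, IP_pools)
        if html is None:
            continue
        # assumed XPath: Douban short reviews sit in <span class="short"> nodes
        for text in html.xpath('//span[@class="short"]/text()'):
            comments.append(text.strip())
    return comments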
