• 大小: 8KB
    文件类型: .py
    金币: 2
    下载: 1 次
    发布日期: 2021-06-17
  • 语言: Python
  • 标签: python  今日头条  

资源简介

爬取今日头条列表以及今日头条详情内容并存储到数据库。

资源截图

代码片段和文件信息

# -*- coding:utf-8 -*-
#设置utf-8编码
import os
import requests
from urllib.parse import urlencode
from hashlib import md5
from multiprocessing.pool import Pool
import time
import pymysql
import random
# NOTE(review): presumably the first and last page-group indices to crawl;
# their use is not visible in this part of the file -- confirm against main().
GROUP_START = 1
GROUP_END = 20

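# Crawl the Toutiao (今日头条) search-result list pages.
# Toutiao bans IPs that crawl too frequently, so a rotating-proxy service (e.g. 牛魔 IP proxy or 太阳 proxy) is recommended.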
def get_page(offset, keyword):
    """Fetch one page of Toutiao search results as parsed JSON.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200;
        ``None`` for any other status code or when the connection fails.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'news',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError:
        # The class is spelled ``ConnectionError``; a misspelled name here
        # would raise AttributeError while handling the real failure.
        return None
    # Non-200 responses are treated the same as a failed connection.
    return None


def get_images(json):
    """Yield one flat record per entry under the payload's ``'data'`` key.

    Args:
        json: Mapping that is read with ``.get('data')``; the value is
            expected to be an iterable of dict-like items. (The parameter
            name shadows the stdlib module name but is kept for callers.)

    Yields:
        dict with the keys ``title``, ``media_name``, ``datetime``,
        ``image01``, ``image02``, ``image03`` and ``tag_id``.
        ``image01``..``image03`` hold the ``'url'`` of the first, second and
        third entries of the item's ``image_list``; slots without a
        corresponding image are the empty string. ``tag_id`` is always
        converted with ``str()`` (so a missing value becomes ``'None'``).
    """
    data = json.get('data')
    if not data:
        return
    for item in data:
        image_list = item.get('image_list') or []
        # Fill the three slots positionally. Assigning every slot inside a
        # loop over the images would leave them all equal to the last URL.
        urls = [image.get('url') for image in image_list[:3]]
        urls += [""] * (3 - len(urls))
        image01, image02, image03 = urls
        yield {
            'title': item.get('title'),
            'media_name': item.get('media_name'),
            'datetime': item.get('datetime'),
            'image01': image01,
            'image02': image02,
            'image03': image03,
            'tag_id': str(item.get('tag_id')),
        }


# Save the image to local disk.
def save_image(item):
    """Download the item's image into a directory named after its title.

    The URL is read from ``item['image']``, has ``'list'`` replaced by
    ``'large'``, and is prefixed with ``'http:'`` before being fetched.
    The file is stored as ``<title>/<md5 of content>.jpg``; a file that
    already exists under that name is not rewritten.

    Args:
        item: Mapping read with ``.get('title')`` and ``.get('image')``.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    local_image_url = item.get('image')
    if not local_image_url:
        # Without a URL there is nothing to download; calling .replace()
        # on None would raise AttributeError, so report and bail out early.
        print('Failed to save image')
        return
    if not os.path.exists(item.get('title')):
        os.mkdir(item.get('title'))
    try:
        new_image_url = local_image_url.replace('list', 'large')
        response = requests.get('http:' + new_image_url)
        if response.status_code == 200:
            # Naming the file by content hash makes repeated downloads of
            # the same image resolve to the same path.
            file_path = '{0}/{1}.{2}'.format(
                item.get('title'), md5(response.content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print('Already Downloaded', file_path)
    except requests.ConnectionError:
        print('Failed to save image')


def main(offset):
    # 创建连接
    conn = pymysql.connect(host=‘127.0.0.1‘ port=3306 user=‘root‘ passwd=‘123456‘ db=‘today_news‘ c

评论

共有 条评论