Resource Overview

A Sina Weibo crawler built with Python 3.7, Scrapy, and MongoDB that collects user profiles and Weibo posts. It implements a high-anonymity IP proxy pool and randomized request User-Agent spoofing (a minimal middleware sketch follows below). For a tutorial, see: https://blog.csdn.net/mengyanyuan8023/article/details/94017903
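
The proxy pool and User-Agent spoofing are typically wired in as a Scrapy downloader middleware. Below is a minimal sketch of that idea; the class name, User-Agent strings, and proxy address are illustrative placeholders rather than this project's actual implementation.

import random


class RandomProxyUserAgentMiddleware:
    """Hypothetical downloader middleware: picks a random User-Agent and a
    proxy from a pool for every outgoing request. The lists below are
    placeholders, not the project's real pool."""

    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15',
    ]
    PROXIES = ['http://127.0.0.1:8888']  # stand-in for the high-anonymity pool

    def process_request(self, request, spider):
        # Scrapy calls this hook for every request before it is downloaded.
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
        request.meta['proxy'] = random.choice(self.PROXIES)
        return None  # continue through the rest of the middleware chain

Such a middleware would be enabled through the DOWNLOADER_MIDDLEWARES setting in settings.py.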

Resource Screenshots

Code Snippet and File Information

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.loader.processors import TakeFirst


class SinaUserItem(scrapy.Item):
    # Unique Weibo user ID
    user_id = scrapy.Field(output_processor=TakeFirst())
    # Nickname
    username = scrapy.Field(output_processor=TakeFirst())
    # Number of Weibo posts
    webo_num = scrapy.Field(output_processor=TakeFirst())
    # Number of accounts followed
    follow_num = scrapy.Field(output_processor=TakeFirst())
    # Number of followers
    fans_num = scrapy.Field(output_processor=TakeFirst())
    # Gender
    gender = scrapy.Field(output_processor=TakeFirst())
    # Region
    district = scrapy.Field(output_processor=TakeFirst())
    # Province
    province = scrapy.Field(output_processor=TakeFirst())
    # City
    city = scrapy.Field(output_processor=TakeFirst())
    # Birthday
    birthday = scrapy.Field(output_processor=TakeFirst())
    # Bio / brief introduction
    brief_intro = scrapy.Field(output_processor=TakeFirst())
    # Verification status
    identify = scrapy.Field(output_processor=TakeFirst())
    # Desktop-site URL
    # internet_url = scrapy.Field(output_processor=TakeFirst())
    # Mobile-site URL
    # mobile_url = scrapy.Field(output_processor=TakeFirst())
    # Avatar URL
    head_img = scrapy.Field(output_processor=TakeFirst())
    # Tags
    # tag = scrapy.Field()

    # Crawl timestamp
    crawl_time = scrapy.Field(output_processor=TakeFirst())


class WeiBoContentItem(scrapy.Item):
    # User ID
    user_id = scrapy.Field(output_processor=TakeFirst())
    # Weibo post ID
    weibo_id = scrapy.Field(output_processor=TakeFirst())
    # Visibility / permission
    # authority = scrapy.Field(output_processor=TakeFirst())
    # Post text
    weibo_content = scrapy.Field(output_processor=TakeFirst())
    # Post images
    weibo_images = scrapy.Field()
    # Local paths of downloaded images
    # images_path = scrapy.Field()
    # Post type (original / repost)
    weibo_type = scrapy.Field(output_processor=TakeFirst())
    # Publish time
    post_time = scrapy.Field(output_processor=TakeFirst())
    # Number of likes
    like_count = scrapy.Field(output_processor=TakeFirst())
    # Number of comments
    comment_count = scrapy.Field(output_processor=TakeFirst())
    # Number of reposts
    retweet_count = scrapy.Field(output_processor=TakeFirst())
    # Client / device the post was published from
    terminal = scrapy.Field(output_processor=TakeFirst())


if __name__ == '__main__':
    item = WeiBoContentItem()
    print(item.get('weibo_images'))
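
Every field above declares output_processor=TakeFirst(), which matters when items are populated through a Scrapy ItemLoader: extracted values arrive as lists, and TakeFirst() collapses each field to its first value. A rough usage sketch follows; the callback name and XPath expression are assumptions, not the project's actual selectors.

from datetime import datetime

from scrapy.loader import ItemLoader


def parse_user(self, response):
    # Hypothetical spider callback: fills a SinaUserItem from a profile page.
    loader = ItemLoader(item=SinaUserItem(), response=response)
    loader.add_value('user_id', response.meta.get('user_id'))
    loader.add_xpath('username', '//title/text()')  # add_xpath collects a list
    loader.add_value('crawl_time', datetime.now().isoformat())
    yield loader.load_item()  # TakeFirst() keeps one value per field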

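Persisting these items to MongoDB is normally handled by an item pipeline. A minimal sketch with pymongo is shown below; the setting names and collection names are assumptions, not necessarily the ones used in this project.

import pymongo


class MongoPipeline:
    """Hypothetical pipeline: writes each scraped item to a MongoDB collection."""

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read connection details from settings.py (setting names are assumptions).
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'sina'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Route user items and post items to separate collections.
        name = 'weibo_content' if isinstance(item, WeiBoContentItem) else 'sina_user'
        self.db[name].insert_one(dict(item))
        return item

The pipeline would be activated through the ITEM_PIPELINES setting in settings.py.
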
 Attribute      Size  Date        Time   Name
----------  --------  ----------  -----  ----
 File             90  2019-06-28  16:24  sina_crawl\.git\COMMIT_EDITMSG
 File            311  2019-06-19  18:11  sina_crawl\.git\config
 File             73  2019-06-13  16:42  sina_crawl\.git\description
 File            102  2019-06-18  15:32  sina_crawl\.git\FETCH_HEAD
 File             25  2019-06-19  18:09  sina_crawl\.git\HEAD
 File            478  2019-06-13  16:42  sina_crawl\.git\hooks\applypatch-msg.sample
 File            896  2019-06-13  16:42  sina_crawl\.git\hooks\commit-msg.sample
 File           3327  2019-06-13  16:42  sina_crawl\.git\hooks\fsmonitor-watchman.sample
 File            189  2019-06-13  16:42  sina_crawl\.git\hooks\post-update.sample
 File            424  2019-06-13  16:42  sina_crawl\.git\hooks\pre-applypatch.sample
 File           1638  2019-06-13  16:42  sina_crawl\.git\hooks\pre-commit.sample
 File           1348  2019-06-13  16:42  sina_crawl\.git\hooks\pre-push.sample
 File           4898  2019-06-13  16:42  sina_crawl\.git\hooks\pre-rebase.sample
 File            544  2019-06-13  16:42  sina_crawl\.git\hooks\pre-receive.sample
 File           1492  2019-06-13  16:42  sina_crawl\.git\hooks\prepare-commit-msg.sample
 File           3610  2019-06-13  16:42  sina_crawl\.git\hooks\update.sample
 File           3052  2019-06-28  16:24  sina_crawl\.git\index
 File            240  2019-06-13  16:42  sina_crawl\.git\info\exclude
 File           3602  2019-06-28  16:24  sina_crawl\.git\logs\HEAD
 File           2383  2019-06-18  17:51  sina_crawl\.git\logs\refs\heads\master
 File           1056  2019-06-28  16:24  sina_crawl\.git\logs\refs\heads\V1.0.619
 File           2186  2019-06-18  17:51  sina_crawl\.git\logs\refs\remotes\origin\master
 File            564  2019-06-28  16:24  sina_crawl\.git\logs\refs\remotes\origin\V1.0.619
 File           1353  2019-06-18  17:51  sina_crawl\.git\objects\03\ba28b053d3d5240e547e4b4b2e085059f828cb
 File           4208  2019-06-19  18:11  sina_crawl\.git\objects\04\3b0786233c2d8f36c77c4f2e010104e3eadeb1
 File           4133  2019-06-18  15:36  sina_crawl\.git\objects\07\8f1dcdae600525ca47a610de0b2f1bb0aec835
 File           4091  2019-06-28  16:24  sina_crawl\.git\objects\09\33f7f23e6f4eb016140a10b7bd04c85d3abc73
 File           1943  2019-06-28  16:22  sina_crawl\.git\objects\0a\23b39cdfdfb9970825a275bff8bfe7be964285
 File            228  2019-06-14  17:52  sina_crawl\.git\objects\0a\b84e0fced0f1e0528e97cfea25e0aa9a1870d1
 File           1931  2019-06-28  16:22  sina_crawl\.git\objects\0b\bdbbf14ec7094db5e2c013380bb55be9f9eee7
............ 464 more file entries omitted
