资源简介
目标:爬取学堂在线合作院校页面内容
网址:https://v1-www.xuetangx.com/partners
要求:爬取到合作院校的名称及该所院校在学堂在线开课的数量,将爬取到的数据保存到一个json文件中!例如:“清华大学,308”
交付内容:整个项目(rar或zip格式)!压缩包名要求为 "ID-作业序号"!
代码片段和文件信息
import logging
import re
from collections import namedtuple
from datetime import time

import six
from six.moves.urllib.parse import (ParseResult, quote, urlparse,
                                    urlunparse)
logger = logging.getLogger(__name__)
_Rule = namedtuple(‘Rule‘ [‘field‘ ‘value‘])
RequestRate = namedtuple(
‘RequestRate‘ [‘requests‘ ‘seconds‘ ‘start_time‘ ‘end_time‘])
_DISALLOW_DIRECTIVE = {‘disallow‘ ‘dissallow‘ ‘dissalow‘ ‘disalow‘ ‘diasllow‘ ‘disallaw‘}
_ALLOW_DIRECTIVE = {‘allow‘}
_USER_AGENT_DIRECTIVE = {‘user-agent‘ ‘useragent‘ ‘user agent‘}
_SITEMAP_DIRECTIVE = {‘sitemap‘ ‘sitemaps‘ ‘site-map‘}
_CRAWL_DELAY_DIRECTIVE = {‘crawl-delay‘ ‘crawl delay‘}
_REQUEST_RATE_DIRECTIVE = {‘request-rate‘ ‘request rate‘}
_HOST_DIRECTIVE = {‘host‘}
_WILDCARDS = {‘*‘ ‘$‘}
_HEX_DIGITS = set(‘0123456789ABCDEFabcdef‘)
# Names exported by ``from <module> import *``.
__all__ = ['RequestRate', 'Protego']
def _is_valid_directive_field(field):
    """Return True if *field* belongs to any of the known directive sets.

    The check is a plain membership test against the module-level
    ``_*_DIRECTIVE`` sets; no normalisation of *field* is done here.
    """
    return any(field in directive for directive in (
        _DISALLOW_DIRECTIVE,
        _ALLOW_DIRECTIVE,
        _USER_AGENT_DIRECTIVE,
        _SITEMAP_DIRECTIVE,
        _CRAWL_DELAY_DIRECTIVE,
        _REQUEST_RATE_DIRECTIVE,
        _HOST_DIRECTIVE,
    ))
def _enforce_path(pattern):
if pattern.startswith(‘/‘):
return pattern
return ‘/‘ + pattern
class _URLPattern(object):
“““Internal class which represents a URL pattern.“““
def __init__(self pattern):
self._pattern = pattern
self.priority = len(pattern)
self._contains_asterisk = ‘*‘ in self._pattern
self._contains_dollar = self._pattern.endswith(‘$‘)
if self._contains_asterisk:
self._pattern_before_asterisk = self._pattern[:self._pattern.find(‘*‘)]
elif self._contains_dollar:
self._pattern_before_dollar = self._pattern[:-1]
self._pattern_compiled = False
def match(self url):
“““Retun True if pattern matches the given URL otherwise return False.“““
# check if pattern is already compiled
if self._pattern_compiled:
return self._pattern.match(url)
if not self._contains_asterisk:
if not self._contains_dollar:
# answer directly for patterns without wildcards
return url.startswith(self._pattern)
# pattern only contains $ wildcard.
return url == self._pattern_before_dollar
if not url.startswith(self._pattern_before_asterisk):
return False
self._pattern = self._prepare_pattern_for_regex(self._pattern)
self._pattern = re.compile(self._pattern)
self._pattern_compiled = True
return self._pattern.match(url)
def _prepare_pattern_for_regex(self pattern):
“““Return equivalent regex pattern for the given URL pattern.“““
pattern = re.sub(r‘\*+‘ ‘*‘ pattern)
s = re.split(r‘(\*|\$$)‘ pattern)
for index substr in
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2020-05-19 10:51 学堂在线\
目录 0 2020-05-19 10:49 学堂在线\.idea\
目录 0 2020-05-19 10:49 学堂在线\.idea\inspectionProfiles\
文件 174 2020-05-19 10:49 学堂在线\.idea\inspectionProfiles\profiles_settings.xm
文件 200 2020-05-19 10:49 学堂在线\.idea\misc.xm
文件 283 2020-05-19 10:49 学堂在线\.idea\modules.xm
文件 4558 2020-05-19 10:49 学堂在线\.idea\workspace.xm
文件 570 2020-05-19 10:49 学堂在线\.idea\学堂在线.iml
目录 0 2020-05-19 10:46 学堂在线\xtzx\
文件 19740888 2020-05-19 10:51 学堂在线\xtzx.zip
目录 0 2020-05-19 10:54 学堂在线\xtzx\.idea\
目录 0 2020-05-09 16:50 学堂在线\xtzx\.idea\inspectionProfiles\
文件 174 2020-05-09 16:25 学堂在线\xtzx\.idea\inspectionProfiles\profiles_settings.xm
文件 195 2020-05-09 16:25 学堂在线\xtzx\.idea\misc.xm
文件 267 2020-05-09 16:25 学堂在线\xtzx\.idea\modules.xm
文件 6315 2020-05-19 10:54 学堂在线\xtzx\.idea\workspace.xm
文件 361 2020-05-09 16:25 学堂在线\xtzx\.idea\xtzx.iml
文件 11010 2020-05-19 10:39 学堂在线\xtzx\MyData.json
目录 0 2020-05-09 16:50 学堂在线\xtzx\venv\
目录 0 2020-05-09 16:25 学堂在线\xtzx\venv\Include\
目录 0 2020-05-09 16:50 学堂在线\xtzx\venv\Lib\
目录 0 2020-05-09 16:50 学堂在线\xtzx\venv\Lib\site-packages\
目录 0 2020-05-09 16:50 学堂在线\xtzx\venv\Lib\site-packages\attr\
目录 0 2020-05-09 16:50 学堂在线\xtzx\venv\Lib\site-packages\attrs-19.3.0.dist-info\
文件 4 2020-05-09 16:29 学堂在线\xtzx\venv\Lib\site-packages\attrs-19.3.0.dist-info\INSTALLER
文件 1082 2020-05-09 16:29 学堂在线\xtzx\venv\Lib\site-packages\attrs-19.3.0.dist-info\LICENSE
文件 9022 2020-05-09 16:29 学堂在线\xtzx\venv\Lib\site-packages\attrs-19.3.0.dist-info\me
文件 2184 2020-05-09 16:29 学堂在线\xtzx\venv\Lib\site-packages\attrs-19.3.0.dist-info\RECORD
文件 5 2020-05-09 16:29 学堂在线\xtzx\venv\Lib\site-packages\attrs-19.3.0.dist-info\top_level.txt
文件 110 2020-05-09 16:29 学堂在线\xtzx\venv\Lib\site-packages\attrs-19.3.0.dist-info\WHEEL
文件 2141 2020-05-09 16:29 学堂在线\xtzx\venv\Lib\site-packages\attr\converters.py
............此处省略4037个文件信息
评论
共有 条评论