资源简介
目标:爬取链家官方网站新房的数据(3-5页即可,页数太多可能导致 IP 被封禁)
网址:https://bj.fang.lianjia.com/loupan/
要求:将楼盘名称、价格、平米数等(可以拓展)数据保存到一个json文件中。
交付:整个project的压缩包(rar或zip格式)。压缩包名要求为 "ID-作业序号"!
我的答案
代码片段和文件信息
import logging
import re
from collections import namedtuple
from datetime import time
import six
from six.moves.urllib.parse import (ParseResult quote urlparse
urlunparse)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# A (field, value) pair; presumably one parsed directive line -- verify
# against the parser that builds these.
_Rule = namedtuple('Rule', ['field', 'value'])

# Public record describing a request rate together with a start/end time.
RequestRate = namedtuple(
    'RequestRate', ['requests', 'seconds', 'start_time', 'end_time'])

# Accepted spellings of each directive name.  The "disallow" set also
# lists several misspelled variants ('dissallow', 'disalow', ...).
_DISALLOW_DIRECTIVE = {
    'disallow', 'dissallow', 'dissalow', 'disalow', 'diasllow', 'disallaw',
}
_ALLOW_DIRECTIVE = {'allow'}
_USER_AGENT_DIRECTIVE = {'user-agent', 'useragent', 'user agent'}
_SITEMAP_DIRECTIVE = {'sitemap', 'sitemaps', 'site-map'}
_CRAWL_DELAY_DIRECTIVE = {'crawl-delay', 'crawl delay'}
_REQUEST_RATE_DIRECTIVE = {'request-rate', 'request rate'}
_HOST_DIRECTIVE = {'host'}

# Characters treated as wildcards inside URL patterns.
_WILDCARDS = {'*', '$'}
# Every character that is a valid hexadecimal digit (both letter cases).
_HEX_DIGITS = set('0123456789ABCDEFabcdef')
# Names exported by ``from <module> import *``.
__all__ = ['RequestRate', 'Protego']
def _is_valid_directive_field(field):
    """Return True if ``field`` belongs to any known directive set.

    ``field`` is compared as-is against the module-level directive sets;
    no case folding or stripping is done here.
    """
    known_directive_sets = (
        _DISALLOW_DIRECTIVE,
        _ALLOW_DIRECTIVE,
        _USER_AGENT_DIRECTIVE,
        _SITEMAP_DIRECTIVE,
        _CRAWL_DELAY_DIRECTIVE,
        _REQUEST_RATE_DIRECTIVE,
        _HOST_DIRECTIVE,
    )
    return any(field in directives for directives in known_directive_sets)
def _enforce_path(pattern):
    """Return ``pattern`` with a leading ``/``, prepending one if missing."""
    if pattern.startswith('/'):
        return pattern
    return '/' + pattern
class _URLPattern(object):
    """Internal class which represents a URL pattern."""

    def __init__(self, pattern):
        """Store ``pattern`` and precompute what ``match`` needs.

        ``priority`` is the length of the pattern string.  The literal
        prefix before the first ``*`` (or the pattern without its trailing
        ``$``) is cached so simple cases can be answered without a regex.
        """
        self._pattern = pattern
        self.priority = len(pattern)
        self._contains_asterisk = '*' in self._pattern
        self._contains_dollar = self._pattern.endswith('$')

        if self._contains_asterisk:
            self._pattern_before_asterisk = self._pattern[:self._pattern.find('*')]
        elif self._contains_dollar:
            self._pattern_before_dollar = self._pattern[:-1]

        # Becomes True once ``_pattern`` has been replaced by a compiled regex.
        self._pattern_compiled = False

    def match(self, url):
        """Return a truthy value if the pattern matches ``url``, else a falsy one.

        Patterns without ``*`` return a plain bool.  Patterns with ``*``
        return ``False`` when the literal prefix does not match, otherwise
        the result of ``re.match`` (a match object or ``None``).
        """
        # check if pattern is already compiled
        if self._pattern_compiled:
            return self._pattern.match(url)

        if not self._contains_asterisk:
            if not self._contains_dollar:
                # answer directly for patterns without wildcards
                return url.startswith(self._pattern)

            # pattern only contains $ wildcard.
            return url == self._pattern_before_dollar

        # Cheap rejection: the text before the first ``*`` must be a prefix
        # of the URL, so the regex is only built when it is really needed.
        if not url.startswith(self._pattern_before_asterisk):
            return False

        # Compile lazily on first real use; ``_pattern`` holds the compiled
        # regex from here on.
        self._pattern = self._prepare_pattern_for_regex(self._pattern)
        self._pattern = re.compile(self._pattern)
        self._pattern_compiled = True
        return self._pattern.match(url)
def _prepare_pattern_for_regex(self pattern):
“““Return equivalent regex pattern for the given URL pattern.“““
pattern = re.sub(r‘\*+‘ ‘*‘ pattern)
s = re.split(r‘(\*|\$$)‘ pattern)
for index substr in
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2020-05-19 11:01 lianjia\
目录 0 2020-05-19 11:02 lianjia\.idea\
目录 0 2020-05-09 16:17 lianjia\.idea\inspectionProfiles\
文件 174 2020-05-09 11:03 lianjia\.idea\inspectionProfiles\profiles_settings.xml
文件 361 2020-05-09 11:03 lianjia\.idea\lianjia.iml
文件 198 2020-05-09 11:03 lianjia\.idea\misc.xml
文件 273 2020-05-09 11:03 lianjia\.idea\modules.xml
文件 6342 2020-05-19 11:02 lianjia\.idea\workspace.xml
文件 17790 2020-05-19 10:59 lianjia\MyData.json
目录 0 2020-05-09 16:19 lianjia\venv\
目录 0 2020-05-09 11:02 lianjia\venv\Include\
目录 0 2020-05-09 16:17 lianjia\venv\Lib\
目录 0 2020-05-09 16:19 lianjia\venv\Lib\site-packages\
目录 0 2020-05-09 16:17 lianjia\venv\Lib\site-packages\attr\
目录 0 2020-05-09 16:17 lianjia\venv\Lib\site-packages\attrs-19.3.0.dist-info\
文件 4 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attrs-19.3.0.dist-info\INSTALLER
文件 1082 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attrs-19.3.0.dist-info\LICENSE
文件 9022 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attrs-19.3.0.dist-info\METADATA
文件 2184 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attrs-19.3.0.dist-info\RECORD
文件 5 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attrs-19.3.0.dist-info\top_level.txt
文件 110 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attrs-19.3.0.dist-info\WHEEL
文件 2141 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attr\converters.py
文件 351 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attr\converters.pyi
文件 1635 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attr\exceptions.py
文件 458 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attr\exceptions.pyi
文件 1098 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attr\filters.py
文件 214 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attr\filters.pyi
文件 0 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attr\py.typed
文件 11460 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attr\validators.py
文件 1868 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attr\validators.pyi
文件 7326 2020-05-09 11:19 lianjia\venv\Lib\site-packages\attr\_compat.py
............此处省略4028个文件信息
评论
共有 条评论