资源简介
主题爬虫的完整实现,具有文章内容判重、主题相似度计算、URL去重、通用正文抽取算法、网页内容分词、关键词自动抽取等功能。
代码片段和文件信息
# -*- coding:utf-8 -*-
# ContentExtract: groups the lines of a text into fixed-size blocks and scans
# the block lengths to pick out a run of long blocks as the candidate body text.
# NOTE(review): this listing has no indentation, uses typographic quotes, and
# is missing the commas between arguments, so it is not runnable as shown.
# The code tokens below are kept exactly as found; only comments were changed.
class ContentExtract:
# No instance state is set up.
def __init__(self):
pass
# Scan `content` (a string that is split on newlines) for the main body text.
# NOTE(review): the listing ends before any loop-counter increment or return
# statement of this method is shown, so the result value cannot be documented
# from this excerpt.
def extract(self content):
# Blocks: each entry is the cleaned concatenation of 3 consecutive lines
block = []
lines = content.split(‘\n‘)
count = 0
str_temp = ‘‘
# Block lengths: line_len[i] is len(block[i])
line_len = []
for line in lines:
str_temp = str_temp + line + “ “
count = count + 1
if count == 3:
# Strip newlines, tabs, carriage returns and spaces from the 3-line group
str_temp = str_temp.replace(“\n“ ““)
str_temp = str_temp.replace(“\t“ ““)
str_temp = str_temp.replace(“\r“ ““)
str_temp = str_temp.replace(“ “ ““)
# Append the block
block.append(str_temp)
# Append the block length
line_len.append(len(str_temp))
str_temp = ‘‘
count = 0
# NOTE(review): no visible code appends a trailing group of fewer than 3 lines
# to `block`; presumably such a remainder is dropped - confirm against the
# full file.
count_len = 0
leng = len(block)
# while count_len < leng:
# print count_len line_len[count_len] block[count_len]
# count_len = count_len + 1
# "*_true" variables hold the best candidate found so far;
# "*_temp" variables hold the candidate currently being built.
count_start = 0
start_true = 0
end_true = 0
line_choice_true = []
content_true = ‘‘
start_temp = 0
end_temp = 0
line_choice_temp = []
content_temp = ‘‘
while count_start < len(block):
# Start block: its length must exceed 50, the previous block must be shorter
# than 30, the next block must exceed 50, and the real start block must come
# after the previous start block.
# NOTE(review): the condition below tests the previous block against 50, not
# 30, and also accepts a block longer than 300 regardless of the next block.
# NOTE(review): line_len[count_start-1] reads the last block when count_start
# is 0, and line_len[count_start+1] is past the end on the last block unless
# the `> 300` alternative is intended to guard it - verify.
if line_len[count_start] > 50 and line_len[count_start-1] < 50 and count_start-1 not in line_choice_temp and (line_len[count_start+1] > 50 or line_len[count_start] > 300):
line_choice_temp = []
content_temp = ‘‘
start_temp = count_start
# print ‘22‘
line_choice_temp.append(count_start) # add the start block's index to the selected block indices
content_temp = content_temp + block[start_temp] # add the start block's text to the body string
# print “start:“ line_choice_temp
# If this block is not a start block, its length is below 5, and the previous
# block is among the selected blocks, mark this block as the end block.
# NOTE(review): the condition below requires this block's length to be exactly
# 0 and applies the "< 5" test to the following block.
elif line_len[count_start] == 0 and count_start - 1 in line_choice_temp:
if line_len[count_start+1] < 5:
end_temp = count_start
# print “end:“ line_choice_temp
# print len(content_temp)
# print len(content_true)
# print line_len[count_start+1]
# Keep the current candidate when its text is longer than the best so far
if len(content_temp) > len(content_true):
# for item in line_choice_temp:
# line_choice_true.append(item)
# print len(content_temp)
# print len(content_true)
line_choice_true = line_choice_temp
start_true = start_temp
end_true = end_temp
content_true = content_temp
# NOTE(review): with indentation lost it is unclear which `if` this `else`
# belongs to; it extends the current candidate with this block.
else:
line_choice_temp.append(count_start)
content_temp = content_temp + block[count_start]
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2016-12-18 16:05 network_spider\
文件 3962 2016-12-03 23:52 network_spider\html_contentextract.py
文件 1748 2016-12-03 23:52 network_spider\html_contentextract.pyc
文件 1165 2016-12-03 23:52 network_spider\html_downloader.py
文件 1295 2016-12-03 23:52 network_spider\html_downloader.pyc
文件 953 2016-12-03 23:52 network_spider\html_manager.py
文件 1692 2016-12-03 23:52 network_spider\html_manager.pyc
文件 764 2016-12-03 23:52 network_spider\html_parser.py
文件 1250 2016-12-03 23:52 network_spider\html_parser.pyc
文件 124 2016-12-03 23:52 network_spider\keyword.txt
文件 7271 2016-12-03 23:52 network_spider\main.py
文件 854 2016-12-18 16:10 network_spider\mysql_manager.py
文件 1438 2016-12-03 23:52 network_spider\mysql_manager.pyc
文件 425 2016-12-04 14:41 network_spider\README.txt
文件 6057 2016-12-03 23:52 network_spider\stopword.txt
文件 2138 2016-12-03 23:52 network_spider\test.py
文件 3035 2016-12-03 23:52 network_spider\test_similarity.py
文件 3258 2016-12-03 23:52 network_spider\test_similarity.pyc
文件 186458 2016-12-03 23:52 network_spider\test_strstrip.html
文件 0 2016-12-03 23:52 network_spider\__init__.py
- 上一篇:Catia圆柱直斜齿轮生成插件
- 下一篇:sunlips一代软件
评论
共有 条评论