资源简介

主题爬虫的完整实现，具有文章内容判重、主题相似度计算、URL 去重、通用正文抽取算法、网页内容分词、关键词自动抽取等功能。

资源截图

代码片段和文件信息

# -*- coding:utf-8 -*-


class ContentExtract:
    def __init__(self):
        pass

    def extract(self content):
        # 块
        block = []
        lines = content.split(‘\n‘)
        count = 0
        str_temp = ‘‘
        # 块长度
        line_len = []
        for line in lines:
            str_temp = str_temp + line + “ “
            count = count + 1
            if count == 3:
                str_temp = str_temp.replace(“\n“ ““)
                str_temp = str_temp.replace(“\t“ ““)
                str_temp = str_temp.replace(“\r“ ““)
                str_temp = str_temp.replace(“ “ ““)
                # 加入块
                block.append(str_temp)
                # 加入块长度
                line_len.append(len(str_temp))
                str_temp = ‘‘
                count = 0

        count_len = 0
        leng = len(block)
        # while count_len < leng:
        #     print count_len line_len[count_len] block[count_len]
        #     count_len = count_len + 1

        count_start = 0

        start_true = 0
        end_true = 0
        line_choice_true = []
        content_true = ‘‘

        start_temp = 0
        end_temp = 0
        line_choice_temp = []
        content_temp = ‘‘

        while count_start < len(block):
            # Start block: its length is > 50, the previous block is < 50 and not already selected, and either the next block is > 50 or this block itself is > 300
            if line_len[count_start] > 50  and line_len[count_start-1] < 50 and count_start-1 not in line_choice_temp and (line_len[count_start+1] > 50 or line_len[count_start] > 300):
                line_choice_temp = []
                content_temp = ‘‘
                start_temp = count_start
                # print ‘22‘
                line_choice_temp.append(count_start)  # 将起始行加入选中的块号
                content_temp = content_temp + block[start_temp]  # 将起始行加入正文的字符串
                # print “start:“ line_choice_temp
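            # Otherwise, an empty block (length 0) whose previous block is selected is a candidate end; it closes the run only if the next block is shorter than 5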
            elif line_len[count_start] == 0 and count_start - 1 in line_choice_temp:
                if line_len[count_start+1] < 5:
                    end_temp = count_start
                    # print “end:“ line_choice_temp
                    # print len(content_temp)
                    # print len(content_true)
                    # print line_len[count_start+1]
                    if len(content_temp) > len(content_true):
                        # for item in line_choice_temp:
                        #     line_choice_true.append(item)
                        # print len(content_temp)
                        # print len(content_true)
                        line_choice_true = line_choice_temp
                        start_true = start_temp
                        end_true = end_temp
                        content_true = content_temp
                else:
                    line_choice_temp.append(count_start)
                    content_temp = content_temp + block[count_start]
                  

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2016-12-18 16:05  network_spider\
     文件        3962  2016-12-03 23:52  network_spider\html_contentextract.py
     文件        1748  2016-12-03 23:52  network_spider\html_contentextract.pyc
     文件        1165  2016-12-03 23:52  network_spider\html_downloader.py
     文件        1295  2016-12-03 23:52  network_spider\html_downloader.pyc
     文件         953  2016-12-03 23:52  network_spider\html_manager.py
     文件        1692  2016-12-03 23:52  network_spider\html_manager.pyc
     文件         764  2016-12-03 23:52  network_spider\html_parser.py
     文件        1250  2016-12-03 23:52  network_spider\html_parser.pyc
     文件         124  2016-12-03 23:52  network_spider\keyword.txt
     文件        7271  2016-12-03 23:52  network_spider\main.py
     文件         854  2016-12-18 16:10  network_spider\mysql_manager.py
     文件        1438  2016-12-03 23:52  network_spider\mysql_manager.pyc
     文件         425  2016-12-04 14:41  network_spider\README.txt
     文件        6057  2016-12-03 23:52  network_spider\stopword.txt
     文件        2138  2016-12-03 23:52  network_spider\test.py
     文件        3035  2016-12-03 23:52  network_spider\test_similarity.py
     文件        3258  2016-12-03 23:52  network_spider\test_similarity.pyc
     文件      186458  2016-12-03 23:52  network_spider\test_strstrip.html
     文件           0  2016-12-03 23:52  network_spider\__init__.py

评论

共有 条评论

相关资源