-
大小: 18.23MB | 文件类型: .zip | 金币: 1 | 下载: 0 次 | 发布日期: 2023-07-04
- 语言: Python
- 标签:
资源简介
爬取百度百科中文页面,抽取三元组信息,构建中文知识图谱
代码片段和文件信息
import glob
import os
import pickle
import re
import sys
import threading
from pathlib import Path

from scrapy.selector import Selector
# Collect every downloaded page under ../webpages; the worker threads consume
# this list.
print('loading pages')
pages = glob.glob('../webpages/*')
print('loading pages done.')
savepath = './paged.bin'
print(len(pages))
# glob returns an empty list when nothing matches, so guard the sample print
# instead of crashing with IndexError.
if pages:
    print(pages[0])

# Page paths recorded by a previous run, restored from the pickle file when it
# exists.
# NOTE(review): the restored list is not used to skip entries of `pages` in the
# code visible here -- confirm whether resuming was intended.
paged = []
if os.path.exists(savepath):
    # pickle can execute arbitrary code on load; only read a state file that
    # this script itself produced.
    with open(savepath, 'rb') as state_file:
        paged = pickle.load(state_file)
    print('load state')

# Shared by all worker threads to serialise access to the shared state above.
lock = threading.Lock()
# Receives the path of every page whose extraction failed; closed by the
# driver code at the end of the script.
fail_file = open('./fail_para.txt', 'w')
class MyThread(threading.Thread):
    """Worker thread that extracts paragraph text from downloaded pages.

    Each worker repeatedly takes the first path from the module-level
    ``pages`` list, extracts it, and records the path in ``paged`` on success
    or in ``fail_file`` on failure. Access to those shared objects is guarded
    by the module-level ``lock``.
    """

    # Matches citation markers such as "[12]" that are stripped from the text.
    _CITATION = re.compile(r'\[[0-9]+\]')

    def __init__(self):
        super().__init__()
        # Cleared by terminate() to ask run() to stop after the current page.
        self._running = True

    def terminate(self):
        """Ask the worker to stop before it takes another page."""
        self._running = False

    def extract(self, page):
        """Write the paragraph text of one saved HTML page to ``./info-para/``.

        Args:
            page: Path of the HTML file to read.

        The output file is named after the concatenated ``<h1>`` text with
        any ``/`` removed, plus ``.txt``. Exceptions from reading, parsing or
        writing propagate to the caller.
        """
        with open(page, 'r') as source:
            html = source.read()
        # Use XPath to select the main-content container.
        # NOTE(review): the queries below start with "//", which Scrapy
        # evaluates against the whole document rather than relative to
        # `line` -- confirm whether ".//" was intended.
        line = Selector(text=html).xpath('//div[contains(@class, "main-content")]')
        title = line.xpath('//h1//text()').extract()
        fragments = line.xpath('//div[contains(@class, "para")]//text()').extract()
        # Drop single-character fragments, then remove citation markers.
        para = self._CITATION.sub('', ''.join(word for word in fragments if len(word) > 1))
        print('process file:' + str(title))
        out_name = './info-para/' + ''.join(title).replace('/', '') + '.txt'
        with open(out_name, 'w') as output:
            output.write(para)

    def run(self):
        """Process pages until the list is empty or terminate() is called.

        A page that fails to extract is reported and logged to ``fail_file``
        (one path per line); the worker then continues with the next page.
        """
        while self._running:
            # Test and take under the same lock so two workers can never
            # claim the same page or pop from an emptied list.
            with lock:
                if not pages:
                    break
                page = pages.pop(0)
            try:
                self.extract(page)
            except Exception as e:
                print('fail to extract..', str(e))
                with lock:
                    fail_file.write(page + '\n')
            else:
                with lock:
                    paged.append(page)
# Driver: run 12 workers over the shared page list, then persist progress.
list_thread = []
try:
    print('start...')
    for _ in range(12):
        list_thread.append(MyThread())
    # Start every worker before waiting on any of them; joining inside the
    # start loop would run the workers one after another.
    for th in list_thread:
        th.start()
    for th in list_thread:
        th.join()
except (Exception, KeyboardInterrupt) as exc:
    # Includes Ctrl-C: ask every worker to stop after its current page.
    for th in list_thread:
        th.terminate()
    print('error!', type(exc))
    # Wait for started workers to finish their current page so that the state
    # saved below is complete and fail_file is not closed under them.
    for th in list_thread:
        if th.is_alive():
            th.join()
finally:
    print('save state')
    # savepath is the same file the script loads its state from at start-up.
    with open(savepath, 'wb') as state_file:
        pickle.dump(paged, state_file)
    fail_file.close()
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2019-06-10 02:25 WEB_KG-master\
文件 40 2019-06-10 02:25 WEB_KG-master\.gitignore
文件 1177 2019-06-10 02:25 WEB_KG-master\README.md
目录 0 2019-06-10 02:25 WEB_KG-master\ie\
文件 1788 2019-06-10 02:25 WEB_KG-master\ie\extract-para.py
文件 2281 2019-06-10 02:25 WEB_KG-master\ie\extract-table.py
目录 0 2019-06-10 02:25 WEB_KG-master\kg\
文件 1083 2019-06-10 02:25 WEB_KG-master\kg\build-triple-from-table.py
文件 1175 2019-06-10 02:25 WEB_KG-master\kg\insert_to_neo4j.py
文件 397289 2019-06-10 02:25 WEB_KG-master\kg\kg.png
文件 53091044 2019-06-10 02:25 WEB_KG-master\kg\triples.txt
目录 0 2019-06-10 02:25 WEB_KG-master\spider\
文件 1337 2019-06-10 02:25 WEB_KG-master\spider\html_downloader.py
文件 2366 2019-06-10 02:25 WEB_KG-master\spider\html_parser.py
文件 2189 2019-06-10 02:25 WEB_KG-master\spider\spider_main.py
文件 648 2019-06-10 02:25 WEB_KG-master\spider\url_manager.py
评论
共有 条评论