• 大小: 6.08MB
    文件类型: .zip
    金币: 1
    下载: 0 次
    发布日期: 2023-09-24
  • 语言: Python
  • 标签:

资源简介

Sequential Event Experiment based on travel notes crawled from XieCheng (Ctrip)，基于50W携程出行游记的采集与顺承事件图谱构建

资源截图

代码片段和文件信息

#!/usr/bin/env python3
# coding: utf-8
# File: pattern.py
# Author: lhy
# Date: 18-7-15

import pymongo
import re
import jieba
from sentence_parser import *

class EventGraph:
    def __init__(self):
        conn = pymongo.MongoClient()
        self.pattern = re.compile(r‘(.*)(其次|然后|接着|随后|接下来)(.*)‘)
        self.col = conn[‘travel‘][‘doc‘]
        self.col_insert = conn[‘travel‘][‘events‘]
        self.parse_handler = LtpParser()

    '''长句切分'''
    def seg_long_sents(self content):
        return [sentence for sentence in re.split(r‘[??!!。;;::\n\r….·]‘ content.replace(‘ ‘‘‘).replace(‘\u3000‘‘‘)) if len(sentence) > 5]

    '''短句切分'''
    def process_subsent(self content):
        return [s for s in re.split(r‘[、,和与及且跟()~▲.]‘ content) if len(s)>1]

    '''处理数据库中的文本'''
    def process_doc(self):
        count = 0
        for item in self.col.find():
            content = item[‘content‘]
            events_all = self.collect_event(content)
            if events_all:
                data = {}
                data[‘events‘] = events_all
                self.col_insert.insert(data)
            else:
                continue

    '''统计收集EVENT'''
    def collect_event(self content):
        events_all = []
        sents= self.seg_long_sents(content)
        for sent in sents:
            events = self.event_extract(sent)
            if events:
                events_all.append(events)
        return events_all

    '''顺承事件抽取'''
    def event_extract(self sent):
        result = self.pattern.findall(sent)
        if result:
            event_seqs = []
            for tmp in result:
                pre = tmp[0]
                post = tmp[2]
                pre_sents = self.process_subsent(pre)
                post_sents = self.process_subsent(post)
                if pre_sents and post_sents:
                    event_seqs += pre_sents
                    event_seqs += post_sents
                else:
                    continue
            ‘‘‘对事件进行结构化‘‘‘
            if event_seqs:
                events = self.extract_phrase(event_seqs)
                return events
            else:
                pass
        return []


    '''将一个长句中的句子进行分解，提取出其中的vob短语'''
    def extract_phrase(self event_seqs):
        events = []
        for event in event_seqs:
            vobs = self.vob_exract(event)
            if vobs:
                events += vobs
        return events

    '''提取VOB关系'''
    def vob_exract(self content):
        vobs = []
        words = list(jieba.cut(content))
        if len(words) >= 300:
            return []
        postags = self.parse_handler.get_postag(words)
        tuples child_dict_list = self.parse_handler.parser_main(words postags)
        for tuple in tuples:
            rel = tuple[-1]
            pos_verb= tuple[4][0]
            pos_object = tuple[2][0]
            if rel == ‘VOB‘ and (pos_verb pos_object) in [(‘v‘ ‘n‘) (‘v‘ ‘i‘)]:
                phrase = ‘‘.

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2018-12-15 05:16  SequentialEventExtration-master\
     文件          93  2018-12-15 05:16  SequentialEventExtration-master\.gitattributes
     目录           0  2018-12-15 05:16  SequentialEventExtration-master\.idea\
     文件         398  2018-12-15 05:16  SequentialEventExtration-master\.idea\SequentialEventGraph.iml
     文件         706  2018-12-15 05:16  SequentialEventExtration-master\.idea\misc.xml
     文件         292  2018-12-15 05:16  SequentialEventExtration-master\.idea\modules.xml
     文件         180  2018-12-15 05:16  SequentialEventExtration-master\.idea\vcs.xml
     文件       20948  2018-12-15 05:16  SequentialEventExtration-master\.idea\workspace.xml
     文件        7111  2018-12-15 05:16  SequentialEventExtration-master\README.md
     目录           0  2018-12-15 05:16  SequentialEventExtration-master\event_graph\
     目录           0  2018-12-15 05:16  SequentialEventExtration-master\event_graph\VIS\
     目录           0  2018-12-15 05:16  SequentialEventExtration-master\event_graph\VIS\dist\
     文件       30798  2018-12-15 05:16  SequentialEventExtration-master\event_graph\VIS\dist\vis.css
     文件     1532584  2018-12-15 05:16  SequentialEventExtration-master\event_graph\VIS\dist\vis.js
     文件      781766  2018-12-15 05:16  SequentialEventExtration-master\event_graph\VIS\dist\vis.map
     文件       22008  2018-12-15 05:16  SequentialEventExtration-master\event_graph\VIS\dist\vis.min.css
     文件      582497  2018-12-15 05:16  SequentialEventExtration-master\event_graph\VIS\dist\vis.min.js
     文件        3312  2018-12-15 05:16  SequentialEventExtration-master\event_graph\event_extract.py
     文件        4001  2018-12-15 05:16  SequentialEventExtration-master\event_graph\event_graph.py
     文件        7007  2018-12-15 05:16  SequentialEventExtration-master\event_graph\sentence_parser.py
     文件    13643483  2018-12-15 05:16  SequentialEventExtration-master\event_graph\seq_events.txt
     文件       55935  2018-12-15 05:16  SequentialEventExtration-master\event_graph\travel_event_graph.html
     目录           0  2018-12-15 05:16  SequentialEventExtration-master\image\
     文件      297401  2018-12-15 05:16  SequentialEventExtration-master\image\all.png
     文件       54270  2018-12-15 05:16  SequentialEventExtration-master\image\book.png
     文件       45649  2018-12-15 05:16  SequentialEventExtration-master\image\food.png
     文件      213664  2018-12-15 05:16  SequentialEventExtration-master\image\graph.png
     文件       86232  2018-12-15 05:16  SequentialEventExtration-master\image\plane.png
     文件       99399  2018-12-15 05:16  SequentialEventExtration-master\image\train.png
     目录           0  2018-12-15 05:16  SequentialEventExtration-master\news_spider\
     目录           0  2018-12-15 05:16  SequentialEventExtration-master\news_spider\.idea\
............此处省略22个文件信息

评论

共有 条评论