• 大小: 5KB
    文件类型: .zip
    金币: 1
    下载: 0 次
    发布日期: 2021-05-14
  • 语言: Python
  • 标签:

资源简介

2019年百度的三元组抽取比赛,一个baseline

资源截图

代码片段和文件信息

#! -*- coding:utf-8 -*-
# 2019年百度的三元组抽取比赛( http://lic2019.ccf.org.cn/kg ),一个baseline

import json
import numpy as np
from random import choice
from tqdm import tqdm


def _load_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path) as f:
        return json.load(f)


# Training and development samples.
train_data = _load_json('../datasets/train_data_me.json')
dev_data = _load_json('../datasets/dev_data_me.json')

# The schema file holds a two-element sequence that unpacks into the
# id -> predicate and predicate -> id mappings.
id2predicate, predicate2id = _load_json('../datasets/all_50_schemas_me.json')
# JSON object keys are always strings, so turn the predicate ids back into ints.
id2predicate = {int(i): j for i, j in id2predicate.items()}

# The character file likewise unpacks into id -> char and char -> id mappings.
id2char, char2id = _load_json('../datasets/all_chars_me.json')

char_size = 128
num_classes = len(id2predicate)


def seq_padding(X, padding=0):
    """Right-pad every sequence in *X* to the length of the longest one.

    Args:
        X: A sequence of lists (each element must support ``+`` with a list).
        padding: Value appended to the shorter sequences. Defaults to 0,
            which matches the original hard-coded behaviour.

    Returns:
        A new list of lists that all share the same length. The input
        sequences are not modified. An empty *X* yields an empty list
        instead of raising ``ValueError`` from ``max()``.
    """
    lengths = [len(x) for x in X]
    if not lengths:
        # max() of an empty list raises; an empty batch simply pads to nothing.
        return []
    max_len = max(lengths)
    return [x + [padding] * (max_len - n) for x, n in zip(X, lengths)]


class data_generator:
    def __init__(self data batch_size=64):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1
    def __len__(self):
        return self.steps
    def __iter__(self):
        while True:
            idxs = range(len(self.data))
            np.random.shuffle(idxs)
            T S1 S2 K1 K2 O1 O2 = [] [] [] [] [] [] []
            for i in idxs:
                d = self.data[i]
                text = d[‘text‘]
                items = {}
                for sp in d[‘spo_list‘]:
                    subjectid = text.find(sp[0])
                    objectid = text.find(sp[2])
                    if subjectid != -1 and objectid != -1:
                        key = (subjectid subjectid+len(sp[0]))
                        if key not in items:
                            items[key] = []
                        items[key].append((objectid
                                           objectid+len(sp[2])
                                           predicate2id[sp[1]]))
                if items:
                    T.append([char2id.get(c 1) for c in text]) # 1是unk,0是padding
                    s1 s2 = [0] * len(text) [0] * len(text)
                    for j in items:
                        s1[j[0]] = 1
                        s2[j[1]-1] = 1
                    k1 k2 = choice(items.keys())
                    o1 o2 = [0] * len(text) [0] * len(text) # 0是unk类(共49+1个类)
                    for j in items[(k1 k2)]:
                        o1[j[0]] = j[2]
                        o2[j[1]-1] = j[2]
                    S1.append(s1)
                    S2.append(s2)
                    K1.append([k1])
                    K2.append([k2-1])
                    O1.append(o1)
                    O2.append(o2)
                    if len(T) == self.batch_size or i == idxs[-1]:
                        T = np.array(seq_padding(T))
                        S1 = np.array(seq_padding(S1))
                        S2 = np.array(seq_padding(S2))
                        O1 = np.array(seq_padding(O1))
                        O2 = np.array(seq_padding(O2))
                        K1 K2 = np.array(K1) np.array(K2)
                        yield [T S

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2019-06-03 15:58  kg-2019-baseline-master\
     文件        2161  2019-06-03 15:58  kg-2019-baseline-master\README.md
     文件        8835  2019-06-03 15:58  kg-2019-baseline-master\kg.py
     文件        1839  2019-06-03 15:58  kg-2019-baseline-master\trans.py

评论

共有 条评论