-
大小: 5KB | 文件类型: .zip | 金币: 1 | 下载: 0 次 | 发布日期: 2021-05-14
- 语言: Python
- 标签:
资源简介
2019年百度的三元组抽取比赛,一个baseline
代码片段和文件信息
#! -*- coding:utf-8 -*-
# 2019年百度的三元组抽取比赛( http://lic2019.ccf.org.cn/kg ),一个baseline
import json
import numpy as np
from random import choice
from tqdm import tqdm
def _load_json(path):
    """Parse the JSON document stored at ``path`` and return the result.

    The file is opened in binary mode so that decoding is handled by ``json``
    rather than by the platform's default text encoding, and the ``with``
    block guarantees the handle is closed (a bare ``json.load(open(...))``
    leaves it open until garbage collection).
    """
    with open(path, 'rb') as f:
        return json.load(f)


# Training and development samples.
train_data = _load_json('../datasets/train_data_me.json')
dev_data = _load_json('../datasets/dev_data_me.json')

# Each of the two files below is unpacked into a pair of mappings
# (id -> item, item -> id), so it must contain exactly two elements.
id2predicate, predicate2id = _load_json('../datasets/all_50_schemas_me.json')
# JSON object keys are always strings; turn them back into integer ids.
id2predicate = {int(i): j for i, j in id2predicate.items()}
id2char, char2id = _load_json('../datasets/all_chars_me.json')

char_size = 128
num_classes = len(id2predicate)
def seq_padding(X):
    """Right-pad every sequence in ``X`` with zeros to a common length.

    Args:
        X: A re-iterable collection of lists. Each element must support
            ``len()`` and concatenation with a list of zeros.

    Returns:
        A new list with one padded copy per input sequence, every copy as
        long as the longest input. The inputs themselves are not modified.
        An empty ``X`` yields an empty list.
    """
    lengths = [len(x) for x in X]
    if not lengths:
        # max() raises ValueError on an empty sequence; nothing to pad anyway.
        return []
    ML = max(lengths)
    return [x + [0] * (ML - len(x)) for x in X]
class data_generator:
def __init__(self data batch_size=64):
self.data = data
self.batch_size = batch_size
self.steps = len(self.data) // self.batch_size
if len(self.data) % self.batch_size != 0:
self.steps += 1
def __len__(self):
return self.steps
def __iter__(self):
while True:
idxs = range(len(self.data))
np.random.shuffle(idxs)
T S1 S2 K1 K2 O1 O2 = [] [] [] [] [] [] []
for i in idxs:
d = self.data[i]
text = d[‘text‘]
items = {}
for sp in d[‘spo_list‘]:
subjectid = text.find(sp[0])
objectid = text.find(sp[2])
if subjectid != -1 and objectid != -1:
key = (subjectid subjectid+len(sp[0]))
if key not in items:
items[key] = []
items[key].append((objectid
objectid+len(sp[2])
predicate2id[sp[1]]))
if items:
T.append([char2id.get(c 1) for c in text]) # 1是unk,0是padding
s1 s2 = [0] * len(text) [0] * len(text)
for j in items:
s1[j[0]] = 1
s2[j[1]-1] = 1
k1 k2 = choice(items.keys())
o1 o2 = [0] * len(text) [0] * len(text) # 0是unk类(共49+1个类)
for j in items[(k1 k2)]:
o1[j[0]] = j[2]
o2[j[1]-1] = j[2]
S1.append(s1)
S2.append(s2)
K1.append([k1])
K2.append([k2-1])
O1.append(o1)
O2.append(o2)
if len(T) == self.batch_size or i == idxs[-1]:
T = np.array(seq_padding(T))
S1 = np.array(seq_padding(S1))
S2 = np.array(seq_padding(S2))
O1 = np.array(seq_padding(O1))
O2 = np.array(seq_padding(O2))
K1 K2 = np.array(K1) np.array(K2)
yield [T S
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2019-06-03 15:58 kg-2019-ba
文件 2161 2019-06-03 15:58 kg-2019-ba
文件 8835 2019-06-03 15:58 kg-2019-ba
文件 1839 2019-06-03 15:58 kg-2019-ba
评论
共有 条评论