资源简介
CCKS2017病例标注,CCKS2017 Task2
数据格式说明:
每个病例分为4个域,分别存储在4个文件夹
一般项目
病史特点
诊疗经过
出院情况
每一个目录下存储两类文件:原始文本文件(*.txtoriginal.txt)和对应的标注文件(*.txt)
代码片段和文件信息
# coding:utf-8
import fio
import codecs
import sys
import os
import jieba.posseg as pseg
# Root directory of the dataset; every input and output path below is built from it.
datadir = "../data2/training dataset v4"
# Sub-folder names, one per record section; also used as the file-name prefix
# of the files inside each folder.
area = ["病史特点", "出院情况", "一般项目", "诊疗经过"]
class CRF_unit:
    """Build per-character feature rows for CRF training and test files.

    ``self.features`` holds the rows produced by the most recent call to
    ``test_into_aline`` (bare characters) or ``get_token``
    (``[char, pos_flag, label]`` lists).  ``get_type`` rewrites the label
    column of rows previously built by ``get_token``.
    """

    # Entity category as written in the annotation files -> tag suffix.
    # Unicode keys keep the lookup working on both Python 2 and Python 3.
    _TYPE_TAGS = {
        u"症状和体征": "SIGNS",
        u"检查和检验": "CHECK",
        u"疾病和诊断": "DISEASE",
        u"治疗": "TREATMENT",
        u"身体部位": "BODY",
    }

    def __init__(self):
        self.features = []

    def test_into_aline(self, filename):
        """Load *filename* and store every character as one feature row.

        Replaces ``self.features`` with the flat sequence of items obtained
        by iterating each sentence returned by ``fio.ReadFileUTF8``.
        """
        self.features = []
        for sentence in fio.ReadFileUTF8(filename):
            self.features.extend(sentence)

    def get_posTag(self, sentence):
        """Return the jieba part-of-speech segmentation of *sentence*."""
        return pseg.cut(sentence)

    def get_token(self, filename):
        """Load *filename* and build one ``[char, pos_flag, "N"]`` row per character.

        Every character of a segmented word inherits that word's
        part-of-speech flag; the third column is the default label ``"N"``,
        which ``get_type`` later overwrites for annotated spans.
        """
        self.features = []
        for sentence in fio.ReadFileUTF8(filename):
            for pair in self.get_posTag(sentence):
                self.features.extend(
                    [char, pair.flag, "N"] for char in pair.word
                )

    def read_type(self, itype):
        """Map an annotation category name to its tag suffix.

        Accepts text (or UTF-8 encoded bytes) and returns one of
        ``"SIGNS"``, ``"CHECK"``, ``"DISEASE"``, ``"TREATMENT"``, ``"BODY"``,
        or ``None`` when the category is not recognised.
        """
        if isinstance(itype, bytes):
            itype = itype.decode("utf-8")
        return self._TYPE_TAGS.get(itype)

    def get_type(self, filename):
        """Apply the annotations in *filename* to the label column of ``self.features``.

        The last three whitespace-separated fields of each line are read as
        start index, end index (inclusive) and category.  Row ``start`` is
        labelled ``"B-<TAG>"`` and rows ``start + 1 .. end`` ``"I-<TAG>"``.
        Blank lines are skipped.  ``get_token`` must have been called first
        so that the rows are mutable lists with a label column.

        An unrecognised category makes ``read_type`` return ``None``, and the
        tag concatenation then raises ``TypeError``.
        """
        for sentence in fio.ReadFileUTF8(filename):
            words = sentence.split()
            if not words:
                # Nothing to parse on an empty line (e.g. a trailing newline).
                continue
            # Progress trace; the single-argument call form prints the same
            # text on Python 2 and Python 3.
            print(words[-3] + words[-2])
            start = int(words[-3])
            end = int(words[-2])
            itype = self.read_type(words[-1])
            self.features[start][2] = "B-" + itype
            for j in range(start + 1, end + 1):
                self.features[j][2] = "I-" + itype
if __name__ == '__main__':
    # Script entry point: write CRF test files for one section of the records.
    extractor = CRF_unit()
    # Index into `area` selecting which section to process.
    x = 0

    # Training-set generation, kept disabled as in the original script.
    # NOTE(review): the range bounds were fused in the source ("1241");
    # restored as 1..240 to match the '1-240_train.txt' output name - confirm.
    #
    # for i in range(1, 241):
    #     filename = datadir + '/' + area[x] + '/' + area[x] + '-' + str(i) + '.txtoriginal.txt'
    #     extractor.get_token(filename)
    #     filename = datadir + '/' + area[x] + '/' + area[x] + '-' + str(i) + '.txt'
    #     extractor.get_type(filename)
    #     filename = datadir + '/result/' + area[x] + "/" + '1-240_train.txt'
    #     fio.AddTrain(extractor.features, filename)

    # Records 241..300 are written out as test files, one output per record.
    for i in range(241, 301):
        filename = datadir + '/' + area[x] + '/' + area[x] + '-' + str(i) + '.txtoriginal.txt'
        extractor.test_into_aline(filename)
        filename = datadir + '/result/' + area[x] + '.testt-' + str(i) + '.txt'
        fio.AddTest(extractor.features, filename)
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2017-11-22 09:15 CCKS2017\
目录 0 2017-08-09 10:18 CCKS2017\CCKS2017_dataset\
目录 0 2017-08-09 10:18 CCKS2017\CCKS2017_dataset\.git\
文件 23 2017-08-09 10:18 CCKS2017\CCKS2017_dataset\.git\HEAD
目录 0 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\branches\
文件 268 2017-08-09 10:18 CCKS2017\CCKS2017_dataset\.git\config
文件 73 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\description
目录 0 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\hooks\
文件 478 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\hooks\applypatch-msg.sample
文件 896 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\hooks\commit-msg.sample
文件 189 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\hooks\post-update.sample
文件 424 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\hooks\pre-applypatch.sample
文件 1642 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\hooks\pre-commit.sample
文件 1348 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\hooks\pre-push.sample
文件 4898 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\hooks\pre-rebase.sample
文件 1239 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\hooks\prepare-commit-msg.sample
文件 3610 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\hooks\update.sample
文件 1960281 2017-08-09 10:18 CCKS2017\CCKS2017_dataset\.git\index
目录 0 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\info\
文件 240 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\info\exclude
目录 0 2017-08-09 10:18 CCKS2017\CCKS2017_dataset\.git\logs\
文件 187 2017-08-09 10:18 CCKS2017\CCKS2017_dataset\.git\logs\HEAD
目录 0 2017-08-09 10:18 CCKS2017\CCKS2017_dataset\.git\logs\refs\
目录 0 2017-08-09 10:18 CCKS2017\CCKS2017_dataset\.git\logs\refs\heads\
文件 187 2017-08-09 10:18 CCKS2017\CCKS2017_dataset\.git\logs\refs\heads\master
目录 0 2017-08-09 10:18 CCKS2017\CCKS2017_dataset\.git\logs\refs\remotes\
目录 0 2017-08-09 10:18 CCKS2017\CCKS2017_dataset\.git\logs\refs\remotes\origin\
文件 187 2017-08-09 10:18 CCKS2017\CCKS2017_dataset\.git\logs\refs\remotes\origin\HEAD
目录 0 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\objects\
目录 0 2017-08-09 10:14 CCKS2017\CCKS2017_dataset\.git\objects\info\
目录 0 2017-08-09 10:18 CCKS2017\CCKS2017_dataset\.git\objects\pack\
............此处省略13886个文件信息
评论
共有 条评论