• 大小: 23.16MB
    文件类型: .zip
    金币: 1
    下载: 0 次
    发布日期: 2023-07-15
  • 语言: 其他
  • 标签: CCKS2017  

资源简介

CCKS2017病例标注。CCKS2017 Task2 数据格式说明:每个病例分为4个域,分别存储在4个文件夹(一般项目、病史特征、诊疗过程、出院情况)中,每一个目录下存储两类文件。

资源截图

代码片段和文件信息

# coding:utf-8

import fio
import codecs
import sys
import os
import jieba.posseg as pseg

# Root directory of the CCKS2017 training data, relative to the working directory.
datadir = "../data2/training dataset v4"
# Names of the per-section sub-folders under `datadir`; the entry script picks
# one of them by index.
area = ["病史特点", "出院情况", "一般项目", "诊疗经过"]

class CRF_unit:
    """Build per-character feature rows for CRF training and testing.

    ``self.features`` holds the rows produced by the most recent call to
    ``test_into_aline`` (one character per entry) or ``get_token`` (one
    ``[character, pos_flag, label]`` list per entry).
    """

    # Annotation type name -> label suffix used in the "B-"/"I-" tags.
    # The ``u`` prefix keeps the keys as text on both Python 2 and Python 3.
    _TYPE_LABELS = {
        u"症状和体征": "SIGNS",
        u"检查和检验": "CHECK",
        u"疾病和诊断": "DISEASE",
        u"治疗": "TREATMENT",
        u"身体部位": "BODY",
    }

    def __init__(self):
        self.features = []

    def test_into_aline(self, filename):
        """Reset ``self.features`` and fill it with every character of the file.

        ``fio.ReadFileUTF8`` is a project helper; it is presumably an iterable
        of text lines -- confirm against the ``fio`` module.
        """
        self.features = []
        for sentence in fio.ReadFileUTF8(filename):
            # Extending a list with a string appends one character at a time.
            self.features.extend(sentence)

    def get_posTag(self, sentence):
        """Return the result of ``jieba.posseg.cut`` for ``sentence``."""
        return pseg.cut(sentence)

    def get_token(self, filename):
        """Reset ``self.features`` and fill it with one row per character.

        Each row is ``[character, flag, "N"]`` where ``flag`` is the
        ``.flag`` of the segmented word containing the character and ``"N"``
        is the initial label, later overwritten by ``get_type`` for
        annotated spans.
        """
        self.features = []
        for sentence in fio.ReadFileUTF8(filename):
            for w in self.get_posTag(sentence):
                for token in w.word:
                    self.features.append([token, w.flag, "N"])

    def read_type(self, itype):
        """Map an annotation type name to its label suffix.

        Accepts text or UTF-8 encoded bytes. Returns ``None`` when the type
        name is not one of the five known categories.
        """
        if isinstance(itype, bytes):
            # Normalise to text once so the lookup works on Python 2 and 3.
            itype = itype.decode("utf-8")
        return self._TYPE_LABELS.get(itype)

    def get_type(self, filename):
        """Apply the annotations in ``filename`` to ``self.features``.

        For every non-blank line the last three whitespace-separated fields
        are read as start index, end index (inclusive) and type name. The
        row at the start index gets label ``"B-<TYPE>"`` and the rows after
        it up to and including the end index get ``"I-<TYPE>"``.

        ``get_token`` must have populated ``self.features`` beforehand.

        Raises:
            ValueError: if a line names an unknown annotation type or its
                index fields are not integers.
            IndexError: if a line has fewer than three fields or an index
                falls outside ``self.features``.
        """
        for sentence in fio.ReadFileUTF8(filename):
            words = sentence.split()
            if not words:
                # Blank lines (e.g. a trailing newline) carry no annotation.
                continue

            # Progress/debug output kept from the original script.
            print(words[-3] + words[-2])
            start = int(words[-3])
            end = int(words[-2])

            itype = self.read_type(words[-1])
            if itype is None:
                raise ValueError(
                    "unknown annotation type %r in %s" % (words[-1], filename)
                )

            self.features[start][2] = "B-" + itype
            for j in range(start + 1, end + 1):
                self.features[j][2] = "I-" + itype



if __name__ == '__main__':
    # Script entry point: convert the original-text files numbered 241-300 of
    # one record section into per-character test files via ``fio.AddTest``.
    extractor = CRF_unit()
    x = 0  # index into `area`: selects which section folder is processed

    # Training-set generation, disabled in the original script and kept here
    # for reference. The range is presumably 1..240, matching the output file
    # name -- confirm before re-enabling.
    #
    # for i in range(1, 241):
    #     filename = datadir + '/' + area[x] + '/' + area[x] + '-' + str(i) + '.txtoriginal.txt'
    #     extractor.get_token(filename)
    #
    #     filename = datadir + '/' + area[x] + '/' + area[x] + '-' + str(i) + '.txt'
    #     extractor.get_type(filename)
    #
    #     filename = datadir + '/result/' + area[x] + "/" + '1-240_train.txt'
    #     fio.AddTrain(extractor.features, filename)

    for i in range(241, 301):
        filename = datadir + '/' + area[x] + '/' + area[x] + '-' + str(i) + '.txtoriginal.txt'
        extractor.test_into_aline(filename)

        filename = datadir + '/result/' + area[x] + '.testt-' + str(i) + '.txt'
        fio.AddTest(extractor.features, filename)
    




 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2017-11-22 09:15  CCKS2017\
     目录           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\
     目录           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\
     文件          23  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\HEAD
     目录           0  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\branches\
     文件         268  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\config
     文件          73  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\description
     目录           0  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\
     文件         478  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\applypatch-msg.sample
     文件         896  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\commit-msg.sample
     文件         189  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\post-update.sample
     文件         424  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\pre-applypatch.sample
     文件        1642  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\pre-commit.sample
     文件        1348  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\pre-push.sample
     文件        4898  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\pre-rebase.sample
     文件        1239  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\prepare-commit-msg.sample
     文件        3610  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\update.sample
     文件     1960281  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\index
     目录           0  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\info\
     文件         240  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\info\exclude
     目录           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\
     文件         187  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\HEAD
     目录           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\refs\
     目录           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\refs\heads\
     文件         187  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\refs\heads\master
     目录           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\refs\remotes\
     目录           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\refs\remotes\origin\
     文件         187  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\refs\remotes\origin\HEAD
     目录           0  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\objects\
     目录           0  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\objects\info\
     目录           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\objects\pack\
............此处省略13886个文件信息

评论

共有 条评论

相关资源