资源简介
基于MP最大概率的Ngram汉语切分(北邮计算机语言学基础)
附有简洁的说明文档和 Python 源代码
代码片段和文件信息
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
# Convert the segmentation points of a line into a sequence of character offsets.
def line2order(line, truth=1):
    """Return the cumulative end offset of every counted word in ``line``.

    The line is split on whitespace. For each token only the text before
    the first ``/`` is counted as the word (the whole token when it
    contains no ``/``).

    Args:
        line: One line of segmented text.
        truth: Number of leading tokens to skip before counting starts.

    Returns:
        A list of ints; element ``k`` is the summed length of the first
        ``k + 1`` counted words. Empty when no tokens remain after skipping.
    """
    orderlist = []
    order = 0
    for token in line.split()[truth:]:
        # Everything up to the first "/" is the word itself.
        order += len(token.partition("/")[0])
        orderlist.append(order)
    return orderlist
# Convert the offset sequence into start/end keys of the segmented words.
def cutlist(orderlist):
    """Return one ``"start-end"`` key per pair of consecutive offsets.

    The two offsets are joined with ``-`` so that different spans can never
    produce the same key (plain concatenation maps both (1, 112) and
    (11, 12) to ``"1112"``).

    Args:
        orderlist: Sequence of integer offsets, as produced by line2order.

    Returns:
        A list of strings with ``len(orderlist) - 1`` entries; an empty list
        when ``orderlist`` has fewer than two offsets (including when it is
        empty).
    """
    # NOTE(review): only pairs of consecutive offsets are emitted, so the
    # span that ends at orderlist[0] never gets a key — confirm that leaving
    # the first word out of the score is intended.
    return [
        str(first) + "-" + str(second)
        for first, second in zip(orderlist, orderlist[1:])
    ]
# Count the correctly segmented words of a single sentence.
def cal_acc(truecut, mycut):
    """Return how many entries of ``truecut`` also occur in ``mycut``.

    Args:
        truecut: Span keys of the reference segmentation.
        mycut: Span keys of the segmentation being scored.

    Returns:
        The number of matching entries as an int. Only the count is
        returned; totals are taken by the caller from the list lengths.
    """
    # Build the lookup set once so each membership test is O(1).
    predicted = set(mycut)
    return sum(1 for span in truecut if span in predicted)
# Score ans.txt against final_ans.txt line by line. The counts are summed
# over all lines first, then precision, recall and F1 are computed once.
correctnum = 0
allnump = 0  # total number of spans in ans.txt (precision denominator)
allnumr = 0  # total number of spans in final_ans.txt (recall denominator)
with open('final_ans.txt', 'r', encoding='utf-8') as fin_true, \
        open('ans.txt', 'r', encoding='utf-8') as my_ans:
    for line in my_ans:
        # The reference file is read in lockstep; readline() yields '' once
        # it is exhausted, which produces an empty offset list below.
        true_order = line2order(fin_true.readline(), truth=0)
        my_order = line2order(line, truth=0)
        # Blank lines give an empty offset list; skip cutlist for those so a
        # stray empty line cannot abort the whole evaluation.
        truecut = cutlist(true_order) if true_order else []
        mycut = cutlist(my_order) if my_order else []
        correctnum += cal_acc(truecut, mycut)
        allnump += len(mycut)
        allnumr += len(truecut)

# Guard every division: empty inputs or zero matches score 0.0 instead of
# raising ZeroDivisionError.
p = correctnum / allnump if allnump else 0.0
r = correctnum / allnumr if allnumr else 0.0
f1 = 2 * p * r / (p + r) if p + r else 0.0
print(p, r)
with open('accuracy.txt', 'w', encoding='utf-8') as f:
    f.write(str(f1))
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 698616 2018-01-11 15:04 切分算法说明文档.docx
文件 28462 2017-11-30 12:41 result.txt
文件 6541 2017-11-30 12:23 MP.py
文件 1587 2017-11-29 19:01 accuracy.py
- 上一篇:动态规划代码
- 下一篇:HTMLTestRunnerNew.py
评论
共有 条评论