资源简介
自然语言处理课程的小作业:以新闻语料为基础,用HMM算法实现中文分词,按照B、E、S、M四种标签对每个字进行标注。
代码片段和文件信息
import numpy as np
import codecs
import re
import os
def write_file(all_wordAndf, file_path):
    """Append n-gram counts to ``file_path``, most frequent entry first.

    Each entry is written as the key immediately followed by its count and
    a single carriage-return character.  Keys are written verbatim, so any
    separator between key and count must already be part of the key.

    Args:
        all_wordAndf: Mapping from n-gram key (str) to its integer count.
        file_path: Destination path.  Opened as UTF-8 in append mode, so
            content already in the file is kept.
    """
    words = list(all_wordAndf)
    counts = [all_wordAndf[word] for word in words]
    # argsort yields ascending indices; the write loop walks them backwards
    # to emit entries in descending order of count.
    order = np.argsort(counts)

    if len(order):
        # Diagnostic output: index and count of the least frequent entry.
        # Guarded so that an empty mapping no longer raises IndexError.
        print(order[0])
        print(counts[order[0]])

    # The context manager guarantees the handle is flushed and closed.
    with codecs.open(file_path, "a", encoding="utf-8") as out:
        for idx in order[::-1]:
            out.write(words[idx] + str(counts[idx]))
            out.write("\r")
def getN_gram(filepath, n):
    """Count n-grams of ``/``-delimited tokens in a UTF-8 corpus file.

    Each line is scanned character by character.  The text accumulated up
    to a ``/`` becomes one token, with ASCII lowercase letters and spaces
    dropped.  The first token of every line is replaced by ``"#"``.  Text
    after the last ``/`` of a line is discarded.

    An n-gram key is built by appending ``"|"`` to each of its tokens and
    concatenating them (so the key ends with ``"|"``), then stripping
    surrounding whitespace.  The counts are also passed to ``write_file``
    with the file name ``"<n>gram.txt"``.

    Args:
        filepath: Path of the corpus file.
        n: Number of tokens per n-gram; must be at least 1.

    Returns:
        Dict mapping each n-gram key to its number of occurrences.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    separator = "|"  # placed after every token inside an n-gram key
    result = {}

    with codecs.open(filepath, "r", encoding="utf-8") as corpus:
        for line in corpus:
            sentence = []
            word = ""
            for char in line:
                if char == "/":
                    # The first token of a line is replaced by "#".
                    sentence.append(word if sentence else "#")
                    word = ""
                elif not ("a" <= char <= "z") and char != " ":
                    word += char

            # "+ 1" so the window ending on the last token is counted too;
            # without it the final n-gram of every line is silently lost.
            for start in range(len(sentence) - n + 1):
                window = sentence[start:start + n]
                key = "".join(token + separator for token in window).strip()
                result[key] = result.get(key, 0) + 1

    write_file(result, str(n) + "gram.txt")
    return result
# Build bigram and unigram counts from the corpus.  As a side effect each
# call also appends its counts to "2gram.txt" / "1gram.txt" respectively.
all_wordAndf = getN_gram("1998-01-2003版-带音.txt", 2)
all_wordAndf_s = getN_gram("1998-01-2003版-带音.txt", 1)
def loadWordFluency_single(filepath):
    """Load word counts from a ``|``-separated UTF-8 file.

    Every non-blank line is stripped and split on ``"|"``; the first field
    is used as the key and the last field, parsed as an integer, as its
    value.  If a key occurs more than once, the last occurrence wins.

    Args:
        filepath: Path of the count file to read.

    Returns:
        Dict mapping the first field of each line to its integer count.

    Raises:
        ValueError: If the last field of a non-blank line is not an integer.
    """
    result = {}
    with codecs.open(filepath, "r", encoding="utf-8") as source:
        for raw_line in source:
            text = raw_line.strip()
            if not text:
                # Blank lines carry no entry; skipping avoids int("") errors.
                continue
            fields = text.split("|")
            result[fields[0]] = int(fields[-1])
    return result
def loadWordFluency_double(filepath):
    """Load two-word counts from a ``|``-separated UTF-8 file.

    Every non-blank line is stripped and split on ``"|"``.  The count is the
    last field parsed as an integer.  It is stored under two keys: the
    first field alone, and ``"<first>|<second>"``.  Lines that do not split
    into exactly three fields are printed as a diagnostic and then
    processed the same way.

    Args:
        filepath: Path of the count file to read.

    Returns:
        Dict with the two kinds of keys described above mapped to counts.

    Raises:
        ValueError: If the last field of a non-blank line is not an integer.
        IndexError: If a non-blank line has fewer than two fields.
    """
    result = {}
    with codecs.open(filepath, "r", encoding="utf-8") as source:
        for raw_line in source:
            text = raw_line.strip()
            if not text:
                # Blank lines carry no entry; skipping avoids int("") errors.
                continue
            fields = text.split("|")
            if len(fields) != 3:
                print(fields)  # diagnostic: unexpected number of fields
            count = int(fields[-1])
            # NOTE(review): the first-word entry is overwritten by every
            # later line that starts with the same word, so it ends up
            # holding the count of the last such line -- confirm that this
            # is what callers expect.
            result[fields[0]] = count
            result[fields[0] + "|" + fields[1]] = count
    return result
# Reload the count tables from the files named "1gram.txt" and "2gram.txt".
gram_1 = loadWordFluency_single("1gram.txt")
gram_2 = loadWordFluency_double("2gram.txt")
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 2417 2017-10-27 13:09 Unti
文件 6100 2017-10-27 13:08 Unti
文件 11276940 2017-09-28 18:55 1998-01-2003版-带音.txt
----------- --------- ---------- ----- ----
11285457 3
- 上一篇:windows下pyltp的whl安装包
- 下一篇:pyltp安装wheel文件
评论
共有 条评论