资源简介
利用十大经典机器学习算法之一的 KNN(K 近邻)算法,实现文本自动分类。
代码片段和文件信息
from __future__ import print_function
import numpy as np
import time
import re
import os
import math
# Module-level configuration for the script.

# Name of the sample-list file. The original literal was wrapped in
# typographic quotes (a syntax error); plain ASCII quotes restore it.
# NOTE: this name shadows the builtin all(); it is kept unchanged because
# code outside this view may refer to it.
all = 'train.txt'
# NOTE(review): the right-hand side of this assignment was lost from the
# source. An empty string is used as a placeholder so the module parses;
# presumably this should be the directory holding the per-document feature
# files -- confirm and set the real path before running.
latentpath = ''
# Presumably forwarded as the `norm` argument of file2matrix -- TODO confirm.
normFlag = False
# Presumably the number of nearest neighbours consulted by KNN -- TODO confirm.
k = 30
def file2matrix(filelisttfidfpath norm=False):
fr = open(filelist)
files = [line.strip() for line in fr.readlines()]
number_of_samples = len(files)
fr.close()
list_of_line=files[0].split()
FileTFIDF=os.path.join(tfidfpathfiles[0].split()[0])
fd=open(FileTFIDF‘r‘)
fea_dim=len(fd.readlines())
fd.close()
SampleMat = np.zeros((number_of_samplesfea_dim)dtype=float)
Label=np.zeros((number_of_samples)dtype=np.int)
fileindex = 0
for file in files:
list_of_line=file.split()
FileTFIDF=os.path.join(tfidfpathlist_of_line[0])
str1=re.sub
评论
共有 条评论