资源简介
利用十大经典机器学习算法之一的SVM(支持向量机)算法,实现文本分类,用于自然语言处理。
代码片段和文件信息
from __future__ import print_function
import numpy as np
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import metrics
import re
import os
import math
# Run configuration.
# NOTE(review): the role of each value below is inferred from its name only;
# confirm against the code that consumes it.
trainscp = 'train.txt'      # presumably the list of training samples
testscp = 'test.txt'        # presumably the list of test samples
outpath = 'result'          # presumably the output location/prefix
latentpath = 'latent_topic'  # presumably where per-sample feature files live
mode = 'a'                  # presumably a file open mode (append) -- TODO confirm
# Plain string concatenation: evaluates to 'resultRESULT.txt' (no path
# separator is inserted between outpath and the file name).
resultfile = outpath + 'RESULT' + '.txt'
# Used as the default value of the ``norm`` argument of file2matrix().
normflag = False
def normalizeModel_1d(a, out=None):
    """Scale a vector to unit Euclidean (L2) length.

    Args:
        a: NumPy array holding the vector.
        out: Optional array of the same shape that receives the result.
            When omitted a new array is allocated (floating point, so that
            integer input can hold the fractional result).

    Returns:
        ``out``. It holds ``a / sqrt(sum(a**2))`` when that norm is non-zero
        and ``a`` has more than one element; otherwise it holds an unscaled
        copy of ``a``.
    """
    if out is None:
        # An integer buffer cannot receive the float quotient from np.divide.
        dtype = a.dtype if np.issubdtype(a.dtype, np.inexact) else np.float64
        out = np.empty_like(a, dtype=dtype)
    norm = math.sqrt(np.sum(a ** 2))
    if norm != 0.0 and len(a) != 1:
        np.divide(a, norm, out)
    elif out is not a:
        # Nothing to scale: pass the values through instead of returning
        # whatever happened to be in the (possibly uninitialized) buffer.
        out[...] = a
    return out
def normalizeModel(M, axis=0, out=None):
    """Normalize the vectors of an array to unit Euclidean (L2) length.

    A 1-D ``M`` is handed straight to ``normalizeModel_1d``. For a 2-D ``M``,
    ``axis == 0`` normalizes every column and any other ``axis`` value
    normalizes every row; each vector is processed by ``normalizeModel_1d``.

    Args:
        M: NumPy array to normalize.
        axis: ``0`` to normalize columns, anything else to normalize rows.
        out: Optional array of the same shape as ``M`` that receives the
            result. When omitted a new floating-point array is allocated.

    Returns:
        The array holding the normalized vectors (``out`` when it was given).
    """
    if len(M.shape) == 1:
        return normalizeModel_1d(M, out)
    if out is None:
        # Start from a floating-point copy of M rather than uninitialized
        # memory, so any vector the 1-D routine does not rescale still holds
        # its original values.
        dtype = M.dtype if np.issubdtype(M.dtype, np.inexact) else np.float64
        out = M.astype(dtype)
    # Transposed views let the same row loop normalize columns; writes
    # through a view land in ``out`` itself.
    if axis == 0:
        vectors, targets = M.T, out.T
    else:
        vectors, targets = M, out
    for vector, target in zip(vectors, targets):
        normalizeModel_1d(vector, target)
    return out
def file2matrix(filelist, tfidfpath, norm=normflag):
    """Load a labelled sample matrix from a list file and per-sample files.

    Every non-blank line of ``filelist`` is split on whitespace: the first
    token is a feature file name (joined onto ``tfidfpath``) and the second
    token is the label, converted to ``int`` after removing any ``ENG``
    substring. Each feature file holds one float per line. The line count of
    the first listed feature file fixes the number of columns.

    Args:
        filelist: Path of the list file.
        tfidfpath: Directory containing the feature files.
        norm: When true, each sample row is divided by its Euclidean norm
            (rows whose norm is zero are left unchanged).

    Returns:
        Tuple ``(SampleMat, Label)``: a float array of shape
        ``(number_of_samples, fea_dim)`` and an int array of length
        ``number_of_samples``. Both are empty when the list has no entries.

    Raises:
        IndexError: If a list line has fewer than two tokens, or a feature
            file holds more values than the first one.
        ValueError: If a label or feature value is not numeric.
    """
    with open(filelist) as fr:
        # Blank lines (e.g. a trailing newline) carry no sample; skip them.
        entries = [line.split() for line in fr if line.strip()]
    if not entries:
        return np.zeros((0, 0), dtype=float), np.zeros(0, dtype=int)

    with open(os.path.join(tfidfpath, entries[0][0]), 'r') as fd:
        fea_dim = sum(1 for _ in fd)

    number_of_samples = len(entries)
    SampleMat = np.zeros((number_of_samples, fea_dim), dtype=float)
    # Builtin int: the np.int alias was removed from NumPy.
    Label = np.zeros(number_of_samples, dtype=int)

    for fileindex, entry in enumerate(entries):
        Label[fileindex] = int(re.sub('ENG', '', entry[1]))
        FileTFIDF = os.path.join(tfidfpath, entry[0])
        with open(FileTFIDF, 'r') as fd:
            values = [float(ele) for ele in fd]
        if len(values) > fea_dim:
            # IndexError matches what the element-by-element fill raised.
            raise IndexError(
                '%s has %d values, expected at most %d'
                % (FileTFIDF, len(values), fea_dim))
        # Shorter files leave the remaining columns at zero.
        SampleMat[fileindex, :len(values)] = values
        if norm:
            normValue = math.sqrt(sum(v ** 2 for v in values))
            if normValue != 0.0:  # avoid 0/0 -> NaN on all-zero samples
                SampleMat[fileindex] /= normValue
    return SampleMat, Label
def file2matrix_v2(filestfidfpath norm=normflag):
number_of_samples = len(files)
list_of_line=files[0].split()
FileTFIDF=os.path.join(tfidfpathfiles[0].split()[0])
fd=open(FileTFIDF‘r‘)
fea_dim=len(fd.readlines())
fd.close()
SampleMat = np.zeros((number_of_samplesfea_dim)dtype=float)
Label=np.zeros((number_of_samples)dtype=np.int)
fileindex = 0
for file in files:
list_of_line=file.split()
FileTFIDF=os.path.join(tfidfpathlist_of_line[0])
str1=re.subn(‘ENG‘‘‘list_of_line[1])
topicid=str1[0]
Label[fileindex]=topicid
fd=open(FileTFIDF‘r‘)
contents=fd.readlines()
fea_col=0
normValue = 0.0
for ele in contents:
SampleMat[fil
- 上一篇:python图像处理.rar
- 下一篇:天天酷跑Python.docx
评论
共有 条评论