SVM实现文本分类代码

大小: 7KB

文件类型: .py

金币: 1

下载: 0 次

发布日期: 2021-05-15
语言: Python
标签:

高速下载

资源简介

利用十大经典机器学习算法之一的SVM（支持向量机）算法，实现文本分类，用于自然语言处理。

资源截图

小图大图

代码片段和文件信息

from __future__ import print_function
import numpy as np
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC  
from sklearn import metrics
import re
import os
import math

# Default file and directory names used by the script.
trainscp = 'train.txt'
testscp = 'test.txt'
outpath = 'result'
latentpath = 'latent_topic'

# NOTE(review): presumably a file-open mode (append) for the result file;
# its use is not visible in this part of the file -- confirm.
mode = 'a'
# Plain string concatenation, no path separator: yields 'resultRESULT.txt'.
resultfile = outpath + 'RESULT' + '.txt'
# Default for the ``norm`` argument of the file2matrix loaders.
normflag = False

def normalizeModel_1d(a, out=None):
    """Scale a vector to unit Euclidean (L2) length.

    Args:
        a: NumPy array to normalize.
        out: Optional array of the same shape that receives the result. A new
            array shaped like ``a`` is allocated when omitted.

    Returns:
        ``out``, holding ``a`` divided by its L2 norm. When the norm is zero
        or ``a`` has exactly one element, ``a`` is copied through unscaled.
    """
    if out is None:
        out = np.empty_like(a)
    s = math.sqrt(np.sum(a ** 2))
    if s != 0.0 and len(a) != 1:
        np.divide(a, s, out)
    else:
        # Without this copy a freshly allocated ``out`` would be returned
        # holding whatever np.empty_like left in memory.
        np.copyto(out, a)
    return out

def normalizeModel(M, axis=0, out=None):
    """Normalize the vectors of an array to unit L2 length.

    A 1-D input is handed straight to ``normalizeModel_1d``. Otherwise each
    column is normalized when ``axis == 0`` and each row for any other value
    of ``axis``.

    Args:
        M: NumPy array to normalize.
        axis: ``0`` to normalize columns; any other value normalizes rows.
        out: Optional array of the same shape that receives the result.

    Returns:
        An array of the same shape as ``M`` holding the normalized vectors.
        When ``out`` is supplied, the result is written into its memory.
    """
    if len(M.shape) == 1:
        return normalizeModel_1d(M, out)
    if out is None:
        # Start from a copy rather than uninitialised memory, so any vector
        # the helper leaves unscaled still holds its original values.
        out = M.copy()
    if axis == 0:
        # Work on transposed views so the loop below walks over columns.
        M = M.T
        out = out.T

    for i, vec in enumerate(M):
        normalizeModel_1d(vec, out[i])

    if axis == 0:
        out = out.T

    return out
        
def file2matrix(filelist, tfidfpath, norm=normflag):
    """Load per-sample feature files into a dense matrix and a label vector.

    ``filelist`` is a text file with one sample per line. The first
    whitespace-separated field names a feature file under ``tfidfpath``; the
    second is the label, from which every ``ENG`` substring is removed before
    it is converted to an integer. Each feature file holds one float per
    line. The feature dimension is the line count of the first sample's
    feature file.

    Args:
        filelist: Path to the sample list file.
        tfidfpath: Directory containing the feature files.
        norm: When true, scale every sample row to unit L2 length.

    Returns:
        A tuple ``(SampleMat, Label)``: a float array of shape
        ``(number_of_samples, fea_dim)`` and an integer array of length
        ``number_of_samples``. Both are empty when the list has no samples.

    Raises:
        IndexError: If a list line has fewer than two fields, or a feature
            file has more lines than the first one.
        ValueError: If a label or feature value cannot be parsed as a number.
    """
    with open(filelist) as fr:
        # Blank lines carry no sample and would break the field split below.
        files = [line.strip() for line in fr if line.strip()]
    number_of_samples = len(files)
    if number_of_samples == 0:
        return np.zeros((0, 0), dtype=float), np.zeros(0, dtype=int)

    with open(os.path.join(tfidfpath, files[0].split()[0]), 'r') as fd:
        fea_dim = sum(1 for _ in fd)

    SampleMat = np.zeros((number_of_samples, fea_dim), dtype=float)
    # The ``np.int`` alias was removed in NumPy 1.24; builtin int is the same.
    Label = np.zeros(number_of_samples, dtype=int)

    for fileindex, entry in enumerate(files):
        list_of_line = entry.split()
        Label[fileindex] = int(list_of_line[1].replace('ENG', ''))
        with open(os.path.join(tfidfpath, list_of_line[0]), 'r') as fd:
            for fea_col, ele in enumerate(fd):
                SampleMat[fileindex, fea_col] = float(ele)
        if norm:
            row = SampleMat[fileindex]  # view: in-place ops update SampleMat
            normValue = math.sqrt(float(np.dot(row, row)))
            # An all-zero sample is left as zeros instead of becoming NaN.
            if normValue != 0.0:
                row /= normValue

    return SampleMat, Label

def file2matrix_v2（filestfidfpath norm=normflag）:
    number_of_samples = len（files） 
    list_of_line=files[0].split（）
    FileTFIDF=os.path.join（tfidfpathfiles[0].split（）[0]）
    fd=open（FileTFIDF‘r‘）
    fea_dim=len（fd.readlines（））
    fd.close（）
    SampleMat = np.zeros（（number_of_samplesfea_dim）dtype=float）
    Label=np.zeros（（number_of_samples）dtype=np.int）
    fileindex = 0
    for file in files:
        list_of_line=file.split（）
        FileTFIDF=os.path.join（tfidfpathlist_of_line[0]）
        str1=re.subn（‘ENG‘‘‘list_of_line[1]）
        topicid=str1[0]
        Label[fileindex]=topicid
        fd=open（FileTFIDF‘r‘）
        contents=fd.readlines（）
        fea_col=0
        normValue = 0.0
        for ele in contents:
            SampleMat[fil

共有条评论

SVM实现文本分类代码

资源简介

资源截图

代码片段和文件信息

评论

相关资源