KNN-文本聚类

大小: 5.05MB

文件类型: .zip

金币: 2

下载: 0 次

发布日期: 2023-11-17
语言: 其他
标签: KNN

高速下载

资源简介

KNN文本聚类，下载后即可使用，适用于文本的无监督学习；运行时请注意内存占用。

资源截图

小图大图

代码片段和文件信息


# coding: utf-8

# In[1]:

import matplotlib.pyplot as plt
from collections import defaultdict
from collections import Counter
from scipy.sparse import csr_matrix, find
import numpy as np
import random
from sklearn.utils import shuffle
from sklearn.metrics import calinski_harabaz_score


def csr_build(dataIndex, value, nnz, nrows):
    """Assemble a CSR matrix from per-row column indices and values.

    Args:
        dataIndex: Iterable of rows; each row is a sequence of column indices.
        value: Iterable of rows parallel to ``dataIndex``; each row holds the
            values stored at the matching column indices.
        nnz: Total number of stored entries across all rows.
        nrows: Number of rows of the resulting matrix.

    Returns:
        A ``csr_matrix`` of dtype double with ``nrows`` rows and
        ``max(column index) + 1`` columns (zero columns when ``nnz`` is 0),
        with the column indices sorted within each row.
    """
    # ``np.int`` was removed from NumPy; use an explicit fixed-width integer.
    ind = np.zeros(nnz, dtype=np.int64)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows + 1, dtype=np.int64)

    filled = 0  # number of entries written so far
    for row, (cols, vals) in enumerate(zip(dataIndex, value)):
        count = len(cols)
        # Only the first ``count`` values are used, one per column index.
        ind[filled:filled + count] = cols
        val[filled:filled + count] = vals[:count]
        filled += count
        ptr[row + 1] = filled

    # ``max`` of an empty index array would raise, so guard the empty case.
    ncols = int(ind.max()) + 1 if nnz else 0
    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()

    return mat

# scale matrix and normalize its rows
def csr_idf(mat, copy=False, **kargs):
    """Scale the stored values of a CSR matrix by inverse document frequency.

    The document frequency of a column is the number of stored entries that
    reference it; each stored value is multiplied by
    ``log(nrows / document_frequency)`` of its column.

    Args:
        mat: CSR matrix to scale. It is modified in place unless ``copy`` is
            True.
        copy: When True, scale and return a copy and leave ``mat`` untouched.
        **kargs: Accepted for call compatibility; unused.

    Returns:
        When ``copy`` is False, a ``defaultdict`` mapping column index to its
        idf factor. Otherwise the scaled matrix.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val = mat.indices, mat.data

    # Document frequency per referenced column, then turn it into idf.
    terms, counts = np.unique(ind, return_counts=True)
    idf = np.log(nrows / counts.astype(np.double))
    df = defaultdict(int, zip(terms.tolist(), idf.tolist()))

    # ``terms`` is sorted, so searchsorted maps each stored column index to
    # the position of its idf factor. Assign through a slice so the update
    # lands in the matrix's own data buffer.
    val[:nnz] = val[:nnz] * idf[np.searchsorted(terms, ind[:nnz])]

    return df if copy is False else mat

def csr_l2normalize(mat, copy=False, **kargs):
    """Normalize the rows of a CSR matrix by their L2 norm.

    Rows whose squared norm is zero (including rows with no stored entries)
    are left unchanged.

    Args:
        mat: CSR matrix to normalize. It is modified in place unless ``copy``
            is True.
        copy: When True, normalize and return a copy and leave ``mat``
            untouched.
        **kargs: Accepted for call compatibility; unused.

    Returns:
        The normalized copy when ``copy`` is True, otherwise ``None``.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    val, ptr = mat.data, mat.indptr

    # Row id of every stored entry, derived from the row pointer array.
    row_of = np.repeat(np.arange(nrows), np.diff(ptr))
    sq_sums = np.bincount(row_of, weights=val[:nnz] ** 2, minlength=nrows)

    # A scale of 1 keeps zero-norm rows as they are.
    scale = np.ones(nrows, dtype=np.double)
    nonzero = sq_sums != 0.0
    scale[nonzero] = 1.0 / np.sqrt(sq_sums[nonzero])

    # Assign through a slice so the matrix's own data buffer is updated.
    val[:nnz] = val[:nnz] * scale[row_of]

    if copy is True:
        return mat



def initCentorids(x, k):
    """Pick the initial centroids from the rows of ``x``.

    The rows are shuffled with a fixed seed (``random_state=0``), so the
    selection is the same on every call, and the first ``k`` rows of the
    shuffled data are returned.

    Args:
        x: Two-dimensional data to draw rows from.
        k: Number of rows to return.

    Returns:
        The first ``k`` rows of the shuffled data.
    """
    x_shuffle = shuffle(x, random_state=0)
    return x_shuffle[:k, :]


# In[15]:

def sim(x1, x2):
    """Return the matrix of dot products between rows of ``x1`` and ``x2``.

    Entry ``(i, j)`` of the result is the dot product of row ``i`` of ``x1``
    with row ``j`` of ``x2``.
    """
    return x1.dot(x2.T)


# In[16]:

def findCentroids(mat, centroids):
    """Assign every row of ``mat`` to its most similar centroid.

    Args:
        mat: Sparse matrix with one row per item.
        centroids: Sparse matrix with one row per centroid.

    Returns:
        A list with one entry per row of ``mat``: the 1-based position of the
        centroid with the highest similarity to that row.
    """
    simsMatrix = sim(mat, centroids)

    idx = []
    for i in range(simsMatrix.shape[0]):
        row = simsMatrix.getrow(i).toarray()[0].ravel()
        # The last element of argsort is the position of the largest
        # similarity; kept (instead of argmax) so ties resolve as before.
        idx.append(row.argsort()[-1] + 1)
    return idx


def computeMeans（mat idx k）:
    centroids = list（）
    for i in range（1k+1）:
        indi = [j for j x in enumerate（idx） if x == i]
        members = mat[indi:]
        if

属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2017-05-09 23:43  Text_Clustering-master\
     目录           0  2017-05-09 23:43  Text_Clustering-master\.ipynb_checkpoints\
     文件       10433  2017-05-09 23:43  Text_Clustering-master\.ipynb_checkpoints\Text_clustering-checkpoint.ipynb
     文件       47137  2017-05-09 23:43  Text_Clustering-master\1.png
     文件      198806  2017-05-09 23:43  Text_Clustering-master\HW6.pdf
     文件        3302  2017-05-09 23:43  Text_Clustering-master\README.md
     文件       20494  2017-05-09 23:43  Text_Clustering-master\Text_clustering.ipynb
     文件       17159  2017-05-09 23:43  Text_Clustering-master\output.dat.txt
     目录           0  2017-05-09 23:43  Text_Clustering-master\report\
     文件       94769  2017-05-09 23:43  Text_Clustering-master\report\README.pdf
     目录           0  2017-05-09 23:43  Text_Clustering-master\src\
     文件        4998  2017-05-09 23:43  Text_Clustering-master\src\Text_clustering.py
     文件     7369300  2017-05-09 23:43  Text_Clustering-master\src\train.dat.txt
     文件     7369300  2017-05-09 23:43  Text_Clustering-master\train.dat.txt

共有条评论

KNN-文本聚类

资源简介

资源截图

代码片段和文件信息

评论

相关资源