资源简介
KNN文本聚类,下载后即可使用,用于文本的无监督学习,运行时请注意内存占用
代码片段和文件信息
# coding: utf-8
# In[1]:
import random
from collections import Counter
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix, find
from sklearn.metrics import calinski_harabaz_score
from sklearn.utils import shuffle
def csr_build(dataIndex, value, nnz, nrows):
    """Assemble a CSR matrix from per-row column indices and values.

    Args:
        dataIndex: Iterable of rows; each row is a sequence of column indices.
        value: Iterable of rows parallel to ``dataIndex``; each row holds the
            values stored at the corresponding column indices.
        nnz: Total number of stored entries across all rows.
        nrows: Number of rows of the resulting matrix.

    Returns:
        A ``scipy.sparse.csr_matrix`` of dtype double with shape
        ``(nrows, largest column index + 1)`` and sorted column indices.
    """
    # np.int was removed in NumPy 1.24; it was an alias of the builtin int,
    # so dtype=int yields the same array type.
    ind = np.zeros(nnz, dtype=int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows + 1, dtype=int)
    offset = 0  # number of entries written so far
    for row, (cols, vals) in enumerate(zip(dataIndex, value)):
        for j, col in enumerate(cols):
            ind[offset + j] = col
            val[offset + j] = vals[j]
        offset += len(cols)
        ptr[row + 1] = offset
    # With no stored entries there is no largest column index to take.
    ncols = int(ind.max()) + 1 if nnz else 0
    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()
    return mat
# scale matrix and normalize its rows
def csr_idf(mat, copy=False, **kargs):
    """Scale the stored values of a CSR matrix by inverse document frequency.

    The document frequency of a column is the number of stored entries in
    that column; its idf is ``log(nrows / document_frequency)``.

    Args:
        mat: CSR matrix to scale. Modified in place unless ``copy`` is True.
        copy: When True, scale and return a copy and leave ``mat`` untouched.
        **kargs: Accepted and ignored.

    Returns:
        When ``copy`` is False, the idf factors as a ``defaultdict`` keyed by
        column index. When ``copy`` is True, the scaled copy of the matrix
        (the factors are not returned in that case).
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    ind, val = mat.indices, mat.data
    # document frequency: count stored entries per column
    df = defaultdict(int)
    for col in ind:
        df[col] += 1
    # inverse document frequency, overwriting the counts to reuse the dict
    for col, count in df.items():
        df[col] = np.log(nrows / float(count))
    # scale every stored value by the idf of its column
    for pos, col in enumerate(ind):
        val[pos] *= df[col]
    return df if copy is False else mat
def csr_l2normalize(mat, copy=False, **kargs):
    """Normalize the rows of a CSR matrix by their L2 norm.

    Args:
        mat: CSR matrix to normalize. Modified in place unless ``copy`` is
            True.
        copy: When True, normalize and return a copy and leave ``mat``
            untouched.
        **kargs: Accepted and ignored.

    Returns:
        The normalized copy when ``copy`` is True, otherwise ``None``.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    val, ptr = mat.data, mat.indptr
    for i in range(nrows):
        start, end = ptr[i], ptr[i + 1]
        row = val[start:end]
        sq_norm = float(np.dot(row, row))
        if sq_norm == 0.0:
            continue  # do not normalize empty (or all-zero) rows
        val[start:end] = row * float(1.0 / np.sqrt(sq_norm))
    if copy is True:
        return mat
def initCentorids(x, k):
    """Return the first ``k`` rows of ``x`` after a seeded shuffle.

    The shuffle uses ``random_state=0``, so repeated calls on the same input
    return the same rows.

    Args:
        x: Row-indexable data (array or sparse matrix) to pick rows from.
        k: Number of rows to return.
    """
    x_shuffle = shuffle(x, random_state=0)
    return x_shuffle[:k:]
# In[15]:
def sim(x1, x2):
    """Return the matrix of dot products between rows of ``x1`` and ``x2``.

    Computed as ``x1.dot(x2.T)``; entry ``[i, j]`` is the dot product of row
    ``i`` of ``x1`` with row ``j`` of ``x2``.
    """
    return x1.dot(x2.T)
# In[16]:
def findCentroids(mat, centroids):
    """Assign each row of ``mat`` to its highest-scoring centroid.

    Scores come from ``sim(mat, centroids)``; the result is read row by row
    through ``getrow``, so it is expected to be a sparse matrix.

    Args:
        mat: Matrix whose rows are the points to assign.
        centroids: Matrix whose rows are the centroids.

    Returns:
        A list with one entry per row of ``mat`` holding the 1-based index
        of the centroid with the largest score for that row.
    """
    idx = []
    simsMatrix = sim(mat, centroids)
    for i in range(simsMatrix.shape[0]):
        row = simsMatrix.getrow(i).toarray()[0].ravel()
        # argsort()[-1] (not argmax) keeps the original tie-breaking order.
        best = row.argsort()[-1]
        idx.append(best + 1)  # cluster labels are 1-based
    return idx
def computeMeans(mat idx k):
centroids = list()
for i in range(1k+1):
indi = [j for j x in enumerate(idx) if x == i]
members = mat[indi:]
if
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2017-05-09 23:43 Text_Clustering-master\
目录 0 2017-05-09 23:43 Text_Clustering-master\.ipynb_checkpoints\
文件 10433 2017-05-09 23:43 Text_Clustering-master\.ipynb_checkpoints\Text_clustering-checkpoint.ipynb
文件 47137 2017-05-09 23:43 Text_Clustering-master\1.png
文件 198806 2017-05-09 23:43 Text_Clustering-master\HW6.pdf
文件 3302 2017-05-09 23:43 Text_Clustering-master\README.md
文件 20494 2017-05-09 23:43 Text_Clustering-master\Text_clustering.ipynb
文件 17159 2017-05-09 23:43 Text_Clustering-master\output.dat.txt
目录 0 2017-05-09 23:43 Text_Clustering-master\report\
文件 94769 2017-05-09 23:43 Text_Clustering-master\report\README.pdf
目录 0 2017-05-09 23:43 Text_Clustering-master\src\
文件 4998 2017-05-09 23:43 Text_Clustering-master\src\Text_clustering.py
文件 7369300 2017-05-09 23:43 Text_Clustering-master\src\train.dat.txt
文件 7369300 2017-05-09 23:43 Text_Clustering-master\train.dat.txt
- 上一篇:企业微信开发升级版
- 下一篇:微信小程序源码带秒杀
相关资源
- 人车分类识别 HOG特征+KNN分类器
- knn文本聚类小型数据集
- 决策树算法原理详解
- 机器学习实战之K邻近算法
- 文本分类器,KNN、SVM、贝叶斯等都有
- R语言分类(SVM KNN LDA等)与回归代码
- KNN算法与Tensorflow分别实现的手写识别
- 一种基于层次分析法的改进KNN算法
- 最邻近算法KNN识别字符
- opencv 的knn的训练数据
- stm32单片机knn算法手写数字识别
- BOW(Bag of Words),词袋模型代码实现
- 数码管数字识别--KNN算法
- KNN疾病预测算法Demo
- PCA+KNN人脸表情识别
- KNN实现手写数字识别
- datingTestSet2数据集
- 用于knn分类的CIFAR-10数据集
- KNN算法实现手写数字识别的三种方法
- 手写体数字识别系统实现
- minst数据集机器学习练习
- 用KNN算法诊断乳腺癌
- KNN算法诊断乳腺癌
- KNN实现代码+数据可视化+决策边界
- knn手写数字识别training及test数据集
- Knn测试数据
- KNN算法预测鸢尾花的种类,源码以及
- kNN分类算法数据集
- 多标记分类器MLKNN
- KNN识别LED数码管
评论
共有 条评论