资源简介
内含手机中文评论数据集(商品编号和评论)以及基于贝叶斯算法的中文评论分类代码(数据集+代码)。
代码片段和文件信息
import jieba
import pandas as pd
from sklearn import metrics, naive_bayes
from sklearn.feature_extraction.text import CountVectorizer  # term-frequency counts

# Module-level handle on the review dump; main() reads its records from here.
file = open('商品编号和评论.txt', 'r', encoding='UTF-8')
# The first line is consumed up front, so main() starts at the second line.
# NOTE(review): presumably this skips a header row — confirm against the data.
rows = file.readline()
def main(max_lines=10001, test_size=1000):
    """Train and evaluate a Bernoulli naive Bayes classifier on the reviews.

    Reads review/score pairs from the module-level ``file`` handle, maps the
    scores to two classes with ``score_get``, segments every review with
    jieba, vectorizes the result with ``CountVectorizer`` and fits
    ``BernoulliNB`` on all rows except the last ``test_size``, which are held
    out for the accuracy report.

    Args:
        max_lines: Maximum number of lines read from ``file``.
        test_size: Number of trailing rows held out for evaluation.

    Raises:
        ValueError: If no usable rows were read, or if there are not more
            usable rows than ``test_size``.
    """
    # The handle is only needed for loading; close it even if reading fails.
    try:
        reviews, scores = _read_reviews(file, max_lines)
    finally:
        file.close()
    if not reviews:
        raise ValueError('no well-formed review rows were read')

    df = pd.DataFrame({'reviews': reviews, 'score': scores})
    df = df[['reviews', 'score']]  # pin the column order
    print(df)

    # Keep the first run of digits in each score field; rows without any
    # digits are dropped so the integer conversion cannot fail on NaN.
    digits = df['score'].str.extract(r'(\d+)', expand=False)
    df = df[digits.notna()].copy()
    df['score'] = digits.dropna().astype(int).apply(score_get)

    split = len(df) - test_size
    if test_size <= 0 or split <= 0:
        raise ValueError(
            'need more than test_size usable rows and a positive test_size'
        )

    # One space-separated token string per review.
    result = [_segment(text) for text in df['reviews'].values]

    vectorize = CountVectorizer()
    # The count matrix stays sparse: BernoulliNB accepts sparse input and a
    # dense rows-by-vocabulary array can be very large.
    X = vectorize.fit_transform(result)

    labels = df['score']
    x_train, y_train = X[:split], labels.iloc[:split]
    x_test, y_test = X[split:], labels.iloc[split:]

    nb = naive_bayes.BernoulliNB()
    nb.fit(x_train, y_train)
    nb_pre = nb.predict(x_test)
    print(nb_pre)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    if hasattr(vectorize, 'get_feature_names_out'):
        word = vectorize.get_feature_names_out()
    else:
        word = vectorize.get_feature_names()
    for w in word:
        print(w)

    accuracy = metrics.accuracy_score(y_test, nb_pre)
    print('分类准确率:', accuracy)


# Punctuation removed from each review before the final segmentation pass.
# NOTE(review): the reviews are Chinese text, so the full-width forms of
# ',', '!' and '?' may be what was intended here — confirm against the data.
_STOPWORDS = frozenset([',', '!', '。', '、', '?', '~'])


def _read_reviews(handle, max_lines):
    """Return (reviews, scores) parsed from at most ``max_lines`` lines."""
    reviews = []
    scores = []
    for _ in range(max_lines):
        line = handle.readline()
        if not line:  # end of file
            break
        # NOTE(review): the separator is assumed to be a comma — confirm
        # against the data file.
        ls = line.split(',')
        # Only rows with exactly five fields are used: field 2 is taken as
        # the review text and field 3 as the raw score.
        if len(ls) == 5:
            reviews.append(ls[2])
            scores.append(ls[3])
    return reviews, scores


def _segment(text):
    """Return ``text`` as space-separated jieba tokens without stopwords."""
    kept = ''.join(
        seg for seg in jieba.lcut(text, cut_all=False)
        if seg not in _STOPWORDS
    )
    # Re-segment after stripping so tokens are cut on the cleaned text.
    return ' '.join(jieba.lcut(kept, cut_all=False))
def score_get(x):
    """Collapse a numeric score into one of two class labels.

    Args:
        x: Numeric score.

    Returns:
        1 when ``x`` is 3 or lower, otherwise 2.
    """
    if x <= 3:
        return 1
    else:
        return 2
# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 135 2018-12-12 15:25 suanfa\.idea\encodings.xml
文件 295 2018-12-12 15:25 suanfa\.idea\misc.xml
文件 264 2018-12-12 15:25 suanfa\.idea\modules.xml
文件 438 2018-12-12 15:25 suanfa\.idea\suanfa.iml
文件 9188 2018-12-12 15:25 suanfa\.idea\workspace.xml
文件 2112 2018-12-13 10:43 suanfa\suanfa.py
文件 75224050 2018-12-13 09:29 suanfa\商品编号和评论.txt
目录 0 2018-12-13 10:44 suanfa\.idea
目录 0 2018-12-13 10:44 suanfa
----------- --------- ---------- ----- ----
75236482 9
- 上一篇:体育用品售卖网站
- 下一篇:TMDB电影数据分析
相关资源
- Handbook of Approximate Bayesian Computation.pdf
- Statistical Rethinking: A Bayesian Course with Examples in R and Stan
- Matrix Variate Distributions
- 贝叶斯统计_第2版_茆诗松_汤银才
- 数据挖掘贝叶斯分类bayes算法
- 基于贝叶斯的盲源反卷积十分完备R
- 基于NaiveBayes的adaboost算法实现
- 贝叶斯压缩感知代码
- 简单贝叶斯实现垃圾邮件分类
- 压缩感知稀疏贝叶斯算法
- Bayesian Data Analysis third edition pdf
- Modeling and Reasoning with Bayesian Networks(
- bayesian分类器
- NaiveBayesClassify朴素贝叶斯分类法-Map
- Bayesian and HSMM
评论
共有 条评论