Python数据分析与机器学习-新闻分类任务

大小: 9.16MB

文件类型: .zip

金币: 2

下载: 0 次

发布日期: 2023-11-03
语言: Python
标签: python

高速下载

资源简介

Python数据分析与机器学习-新闻分类任务

资源截图

小图大图

代码片段和文件信息

import pandas as pd
import jieba
import numpy

# Raise pandas' console display limits so printed frames are not truncated.
# NOTE: the legacy "display.height" option is deliberately not set. It was
# deprecated in favour of "display.max_rows" and has since been removed from
# pandas, where trying to set it raises OptionError.
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.width", 9999)

# Load the tab-separated news corpus; column names are supplied explicitly.
df_news = pd.read_table(
    "./data/val.txt",
    names=["category", "theme", "URL", "content"],
    encoding="utf-8",
)
df_news = df_news.dropna()  # drop rows that contain missing values
# print(df_news.shape)  # (5000, 4) per the original author's note

content = df_news["content"].values.tolist()  # list of article texts
# print(content[1000])
content_S = []  # one jieba token list per kept article
for line in content:
    current_segment = jieba.lcut(line)
    # Keep only articles that produce more than one token. The original also
    # compared the token *list* with the string "\r\n"; a list never equals a
    # string, so that test was always true and filtered nothing. It is removed
    # here without changing which articles are kept.
    if len(current_segment) > 1:
        content_S.append(current_segment)
# print(content_S[1000])

df_content = pd.DataFrame({"content_S": content_S})
# print(df_content.head())

# quoting=3 is csv.QUOTE_NONE: quote characters in the stop-word file are
# treated as ordinary text rather than as field delimiters.
stopwords = pd.read_csv(
    "stopwords.txt",
    index_col=False,
    sep="\t",
    quoting=3,
    names=["stopword"],
    encoding="utf-8",
)


def drop_stopwords(contents, stopwords):
    """Remove stop words from tokenized news articles.

    Args:
        contents: Iterable of articles, each an iterable of word tokens.
        stopwords: Iterable of hashable stop-word tokens to filter out.

    Returns:
        A ``(contents_clean, all_words)`` tuple. ``contents_clean`` holds one
        list per input article with its non-stop-word tokens in their
        original order; ``all_words`` is the flat list of every kept token
        across all articles, in encounter order.
    """
    # Build the lookup set once so each membership test is O(1) instead of a
    # linear scan over the stop-word list for every token.
    stopword_set = set(stopwords)
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = [word for word in line if word not in stopword_set]
        contents_clean.append(line_clean)
        all_words.extend(line_clean)
    return contents_clean, all_words


contents = df_content["content_S"].values.tolist()
stopwords = stopwords["stopword"].values.tolist()

contents_clean, all_words = drop_stopwords(contents, stopwords)

df_content = pd.DataFrame({"contents_clean": contents_clean})
df_all_words = pd.DataFrame({"all_words": all_words})

# Count occurrences of each remaining word. The original passed a renaming
# dict to SeriesGroupBy.agg ({"count": numpy.size}); that form was removed
# from pandas, so the group sizes are computed directly and the resulting
# column is named "count".
words_count = df_all_words.groupby("all_words").size().reset_index(name="count")
words_count = words_count.sort_values(by=["count"], ascending=False)
# print(words_count.head().values)
# Sample output recorded by the original author:
# [['中' 5199]
#  ['中国' 3115]
#  ['说' 3055]
#  ['Ｓ' 2646]
#  ['万' 2390]]
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib

matplotlib.rcParams["figure.figsize"] = (10.0, 5.0)

# Draw a word cloud from the 100 most frequent words.
# font_path points at the bundled simhei.ttf, presumably so that Chinese
# glyphs render correctly (confirm if the font is swapped).
wordcloud = WordCloud(
    font_path="./data/simhei.ttf",
    background_color="white",
    max_font_size=80,
)
# Each row of words_count is (word, count); map word -> count.
word_frequence = {x[0]: x[1] for x in words_count.head(100).values}
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
plt.show()

# TF-IDF: keyword extraction
import jieba.analyse

index = 2400
# NOTE(review): df_news["content"][index] is a label-based lookup on a frame
# that has been through dropna(), while content_S[index] is positional and
# was additionally filtered by token count. The two may not refer to the
# same article if any row was dropped or filtered; confirm against the data.
print(df_news["content"][index])
content_S_str = "".join(content_S[index])
print("  ".join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))

# LDA: topic model
from gensim import corpora, models, similarities
import gensim

# Build the word -> id mapping, then turn each article into a bag of words.
dictionary = corpora.Dictionary(contents_clean)
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]
# num_topics is the number of topics the model should produce.
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
# Print the 5 highest-weighted words of the topic with index 1.
print(lda.print_topic(1, topn=5))
for topic in lda.print_topics（num_topics=20 num_words=5

属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2018-01-06 09:43  chapter16\
     目录           0  2018-01-05 20:38  chapter16\data\
     文件    10044356  2017-06-14 23:56  chapter16\data\simhei.ttf
     文件     9948878  2017-07-25 08:24  chapter16\data\val.txt
     文件      365370  2018-01-06 09:13  chapter16\show_Chinese.png
     文件       17672  2017-06-14 23:55  chapter16\stopwords.txt
     文件        5463  2017-03-03 12:00  chapter16\中文停用词库.txt
     文件        6038  2017-03-03 07:38  chapter16\哈工大停用词表.txt
     文件        8571  2017-03-03 12:00  chapter16\四川大学机器智能实验室停用词库.txt
     文件        5644  2018-01-06 09:43  chapter16\新闻分类任务.py

上一篇：centos7下 samba 4.8.3所有资源包含主文件和其它配套组件
下一篇：深度学习入门：基于python的理论与实践

共有条评论

Python数据分析与机器学习-新闻分类任务

资源简介

资源截图

代码片段和文件信息

评论

相关资源