资源简介
TensorFlow实战中实现word2vec代码(含中文注释)
代码片段和文件信息
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#%%
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import collections
import math
import os
import random
import zipfile
import numpy as np
import urllib
import tensorflow as tf
# Step 1: Download the data.
# Base URL that the corpus archive name is appended to when downloading.
url = 'http://mattmahoney.net/dc/'
def maybe_download(filename, expected_bytes):
    """Download a file if not present and make sure it's the right size.

    Args:
        filename: Local path of the file; also the name appended to the
            module-level ``url`` when the file has to be fetched.
        expected_bytes: Exact size in bytes the file must have on disk.

    Returns:
        The path of the verified file.

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule in
    # Python 3, so import it explicitly where it is used.
    import urllib.request

    # Only hit the network when there is no local copy yet.
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)

    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        # Show the actual size to make a truncated download easy to spot.
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename
# Fetch (or reuse) the text8 corpus archive and check its exact byte size.
filename = maybe_download('text8.zip', 31344016)
# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words.

    Args:
        filename: Path to a zip archive.

    Returns:
        The whitespace-separated tokens of the archive's first member.
    """
    with zipfile.ZipFile(filename) as f:
        first_member = f.namelist()[0]
        # ZipFile.read returns bytes; decode as UTF-8 before tokenising.
        data = f.read(first_member).decode('utf-8').split()
    return data
words = read_data(filename)
print('Data size', len(words))
# Step 2: Build the dictionary and replace rare words with UNK token.
# Number of distinct ids in the vocabulary, including the 'UNK' entry.
vocabulary_size = 50000
def build_dataset(words, vocab_size=None):
    """Encode a word sequence as integer ids over a fixed-size vocabulary.

    Args:
        words: Sequence of word strings.
        vocab_size: Total number of ids, including the 'UNK' entry at id 0.
            When omitted, the module-level ``vocabulary_size`` is used.

    Returns:
        A tuple ``(data, count, dictionary, reverse_dictionary)`` where
        ``data`` is the list of ids for ``words``, ``count`` is a list whose
        first element is ``['UNK', unk_count]`` followed by ``(word, freq)``
        pairs in descending frequency, ``dictionary`` maps word -> id and
        ``reverse_dictionary`` maps id -> word.
    """
    if vocab_size is None:
        # Fall back to the module-level setting so existing callers keep
        # their behaviour.
        vocab_size = vocabulary_size

    # Word -> frequency pairs: a placeholder for UNK, then the
    # (vocab_size - 1) most frequent words.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocab_size - 1))

    # Word -> id; ids are handed out in order of appearance in ``count``.
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    # Convert every word to its id and tally the out-of-vocabulary ones.
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)

    # Replace the UNK placeholder frequency with the real tally.
    count[0][1] = unk_count

    # Id -> word lookup.
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary
# Encode the corpus and keep the lookup tables for later steps.
data, count, dictionary, reverse_dictionary = build_dataset(words)
#print(len(count
相关资源
- [python]天气预报附带gui界面
- 基于GDAL的Python实现遥感影像PCA的代码
- SVM人脸识别的Python代码
- Python代码王者荣耀全皮肤图片
- 基于Python的SVM模块源代码
- 深度学习入门代码 5-1 mnist数据集.p
- 适合的新手-CNN代码
- BP算法Python代码
- python图像处理三维重建所有代码
- 超限学习机—逻辑回归Python代码
- 对任意关键字爬虫对应图片代码
- 网站图片爬取代码
- 协同过滤代码实现
- python五子棋代码
- 基于用户协同过滤usercf的python代码实
- 目标跟踪代码
- 基于lstm的语义相似度计算模型代码
- tensorflow2.0实现mnist手写数字识别代码
- 中间代码生成代码中缀表达式转换为
- Python源码剖析_代码(pythonympx.rar)
- python 战棋游戏六边形地图代码实现
- naive bayes代码实现(python版)
- Python3.x+Pyqt5实现绘图界面matplotlib绘图
- python-克里金插值 代码
- Pyboard利用两个Zigbee模块发送并接收
- Python从入门到精通(明日科技出版)
- 面向Arcgis的python脚本编程 中文教程英
- python火焰检测代码
- python从入门到实践课后试一试代码.
- Hopfield Neural Network——神经网络pytho
评论
共有 条评论