大小: 6KB文件类型: .py金币: 2下载: 1 次发布日期: 2021-06-10
- 语言: Python
- 标签: word2vec tensorflow
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import math
import urllib.request
import zipfile
import random
import collections
import numpy as np
import tensorflow as tf
# Base URL that may_download() prepends to a filename when fetching it.
url = 'http://mattmahoney.net/dc/'

'''Step1: download dataset'''
def may_download(filename, expected_bytes):
    """Download ``filename`` if it is not present and verify its size.

    Args:
        filename: Local path of the file; also appended to the module-level
            ``url`` to form the download address when the file is missing.
        expected_bytes: Exact size in bytes the file must have.

    Returns:
        The path of the verified local file.

    Raises:
        Exception: If the file size differs from ``expected_bytes``.
    """
    if not os.path.exists(filename):
        # urlretrieve returns (local_path, headers); only the path is needed.
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size != expected_bytes:
        raise Exception('Failed to verify ' + filename)
    print('Found and verified', filename)
    return filename
# Fetch the text8 archive (or reuse a local copy) and check that it is
# exactly 31344016 bytes long.
filename = may_download('text8.zip', 31344016)

'''Step2: data transformation'''
def read_data(filename):
    """Return the first member of a zip archive as a list of tokens.

    The member is decoded as UTF-8 and split on whitespace.

    Args:
        filename: Path to the zip archive.

    Returns:
        A list of ``str`` tokens.

    Raises:
        IndexError: If the archive contains no members.
    """
    with zipfile.ZipFile(filename) as f:
        # ZipFile.read returns bytes; decoding them as UTF-8 is what
        # tf.compat.as_str_any did for bytes input.
        raw = f.read(f.namelist()[0])
    return raw.decode('utf-8').split()
# Load the corpus as a flat list of tokens and report how many there are.
words = read_data(filename)
print('Datas size', len(words))

'''Step3: make dataset'''
# Default vocabulary size (including the 'UNK' entry) used by build_dataset().
vocabulary_size = 50000
def build_dataset(words, vocab_size=None):
    """Encode a token sequence as integer ids over a fixed-size vocabulary.

    The vocabulary holds the ``vocab_size - 1`` most frequent tokens plus the
    placeholder ``'UNK'`` (id 0), which stands in for every other token.

    Args:
        words: Sequence of tokens. It is iterated twice, so it must not be a
            one-shot iterator.
        vocab_size: Total vocabulary size including ``'UNK'``. Defaults to the
            module-level ``vocabulary_size``.

    Returns:
        A tuple ``(data, count, dictionary, reverse_dictionary)``:
        ``data`` is the list of ids, one per input token;
        ``count`` is a list of ``[token, frequency]`` entries whose first
        entry is ``['UNK', <number of tokens mapped to UNK>]``;
        ``dictionary`` maps token -> id;
        ``reverse_dictionary`` maps id -> token.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size

    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocab_size - 1))

    # Ids follow frequency rank: 'UNK' is 0, the most common token is 1, ...
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # id of 'UNK'
            unk_count += 1
        data.append(index)

    # Replace the -1 placeholder with the real out-of-vocabulary count.
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary
# Encode the corpus; the raw token list is dropped afterwards to free memory.
data, count, dictionary, reverse_dictionary = build_dataset(words)
del words
print('Most common words (+UNK) ', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

'''Step4: generate training samples'''
# Read cursor into `data`; generate_batch() advances it on every call.
data_index = 0
def generate_batch(batch_size, num_skips, skip_window):
    """Build one batch of (center, context) pairs from the global ``data``.

    A window of ``2 * skip_window + 1`` consecutive ids slides over ``data``,
    starting at the global ``data_index`` and wrapping around at the end. For
    each window position, ``num_skips`` distinct non-center positions are
    picked at random and paired with the center id.

    Args:
        batch_size: Number of pairs to produce; must be a multiple of
            ``num_skips``.
        num_skips: Pairs generated per center id; at most ``2 * skip_window``.
        skip_window: Number of ids considered on each side of the center.

    Returns:
        A tuple ``(batch, labels)`` of ``np.int32`` arrays with shapes
        ``(batch_size,)`` and ``(batch_size, 1)``: center ids and the
        corresponding context ids.

    Side effects:
        Advances the module-level ``data_index``.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= (2 * skip_window)

    # Every slot is written below, so uninitialised arrays are fine.
    batch = np.empty(batch_size, dtype=np.int32)
    labels = np.empty((batch_size, 1), dtype=np.int32)

    span = 2 * skip_window + 1  # [ skip_window, center, skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    for i in range(batch_size // num_skips):
        target = skip_window  # start on the center so the loop must re-draw
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            # Remember the pick so each context position is used only once.
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window one id to the right; maxlen evicts the oldest id.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
# Smoke test: print a small batch as "center-id center-word -> context-id
# context-word" lines.
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]],
          '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
'''Step 5: training'''
# Training hyper-parameters.
batch_size = 128
embedding_size = 128
# NOTE(review): generate_batch() uses a window of 2 * skip_window + 1 ids, so
# this gives a 257-id window -- confirm that 128 (rather than 1) is intended.
skip_window = 128
num_skips = 2
# Presumably validation-sampling settings; their use is not visible in this
# part of the file -- TODO confirm.
valid_size = 16
valid_window = 100
valid_examples =
- 上一篇:python37_d.lib文件
- 下一篇:mnist_normal
- tensorflow制作自己的灰度图像数据集并
- 维基百科中文语料word2vec训练后结果
- anaconda下安装tensorflow(注:不同版本
- 北京大学曹健老师-人工智能实践:
- Deep Learning With Python - Jason Brownlee
- Python-自然场景文本检测PSENet的一个
- Python-高效准确的EAST文本检测器的一个
- Python-TensorFlow弱监督图像分割
- Python-基于tensorflow实现的用textcnn方法
- Python-subpixel利用Tensorflow的一个子像素
- 【官方文档】TensorFlow Python API docume
tensorflow画风迁移代码 style transfer - 简单粗暴 TensorFlow
- [PDF] Reinforcement Learning With Open AI Tens
- tensorflow目标检测代码
- 基于Python的手写字体识别系统
- 基于Tensorflow的人脸识别源码
- python TensorFlow 官方文档中文版
- Python-在TensorFlow中实现实现图像卷积网
- tensorflow-1.9.0-cp37-cp37m-win_amd64.whl
- Faster-RCNN-TensorFlow-Python3.5-master
- 聊天机器人tensorflow
- caffe模型转化为tensorflow模型
- Python-一个非常简单的BiLSTMCRF模型用于
- Python-Tensorflow仿AlphaGo框架实现的AI围棋
- Mask R-CNN源码(TensorFlow版本)
- 基于python3 tensorflow DBN_and_RNN的实现
- tensorflow-0.8.0-cp34-cp34m-linux_x86_64.whl
- Hands-On Machine Learning with Scikit-Learn an
- python3中文识别词库模型
共有 条评论