• 大小: 6KB
    文件类型: .py
    金币: 2
    下载: 1 次
    发布日期: 2021-06-10
  • 语言: Python
  • 标签: word2vec  tensorflow  

资源简介

word2vec的tensorflow实现,来自黄文坚的“tensorflow实战”

资源截图

代码片段和文件信息

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import math
import urllib.request
import zipfile
import random
import collections
import numpy as np
import tensorflow as tf
# Base URL that dataset file names are appended to when downloading.
url = 'http://mattmahoney.net/dc/'

# Step 1: download dataset
def may_download(filename, expected_bytes):
    """Download ``filename`` if it is not present locally and verify its size.

    Args:
        filename: Name of the file; it is appended to the module-level
            ``url`` for the download and also used as the local path.
        expected_bytes: Size in bytes the local file must have.

    Returns:
        The path of the verified local file.

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``.
    """
    if not os.path.exists(filename):
        # urlretrieve returns (local_path, headers); only the path is needed.
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        # Show the actual size to make a truncated download easy to spot.
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + filename)
    return filename

# Fetch the text8 archive (or reuse a local copy) and check its byte size.
filename = may_download('text8.zip', 31344016)

# Step 2: data transformation
def read_data(filename):
    """Read the first member of a zip archive as a list of words.

    Args:
        filename: Path to the zip archive.

    Returns:
        A list of whitespace-separated tokens from the archive's first member.
    """
    with zipfile.ZipFile(filename) as f:
        # ZipFile.read returns bytes, so a plain UTF-8 decode is enough here
        # and keeps this helper usable without TensorFlow.
        data = f.read(f.namelist()[0]).decode('utf-8').split()
    return data

# Test: load the corpus and report how many tokens it contains.
words = read_data(filename)
print('Datas size', len(words))

# Step 3: make dataset
# Number of distinct ids, including id 0 which is reserved for 'UNK'.
vocabulary_size = 50000


def build_dataset(words):
    """Map a token list to integer ids using the most frequent words.

    The ``vocabulary_size - 1`` most common words get ids 1, 2, ... in
    frequency order; every other word is mapped to id 0 ('UNK').

    Args:
        words: List of string tokens.

    Returns:
        A tuple ``(data, count, dictionary, reverse_dictionary)`` where
        ``data`` is the id of each token in ``words``, ``count`` is
        ``['UNK', n_unknown]`` followed by ``(word, frequency)`` pairs,
        ``dictionary`` maps word to id, and ``reverse_dictionary`` maps id
        back to word.
    """
    # The UNK entry is a list (not a tuple) so its count can be set later.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))

    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # id of 'UNK'
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
# The raw token list is no longer needed once it has been encoded as ids.
del words

# Test: show the most frequent entries and a decoded sample of the data.
print('Most common words (+UNK) ', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

# Step 4: generate training samples
# Read position in the module-level ``data`` list; advanced by generate_batch.
data_index = 0


def generate_batch(batch_size, num_skips, skip_window):
    """Build one batch of (center word, context word) training pairs.

    Reads from the module-level ``data`` list starting at ``data_index`` and
    advances ``data_index``, wrapping around at the end of ``data``.

    Args:
        batch_size: Number of pairs to produce; must be a multiple of
            ``num_skips``.
        num_skips: Number of context words sampled for each center word;
            must not exceed ``2 * skip_window``.
        skip_window: Number of words considered on each side of the center.

    Returns:
        A tuple ``(batch, labels)`` of int32 arrays with shapes
        ``(batch_size,)`` and ``(batch_size, 1)``: center ids and the
        matching context ids.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= (2 * skip_window)
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [skip_window, center, skip_window]
    buffer = collections.deque(maxlen=span)

    # Fill the sliding window with the first ``span`` ids.
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    for i in range(batch_size // num_skips):
        target = skip_window  # the center position is never used as a label
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # Draw a window position that has not been used for this center.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)

            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window one word to the right; maxlen drops the oldest id.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    return batch, labels

# Test: print a small batch as "center_id center_word -> context_id context_word".
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

# Step 5: training
batch_size = 128  # number of (center, context) pairs per generated batch
embedding_size = 128  # presumably the embedding vector dimension; its use is outside this excerpt
# Words considered on each side of the center word. This was 128, which makes
# the context window 257 words wide and looks like a copy of the value above;
# the reference word2vec tutorial uses 1.
skip_window = 1
num_skips = 2  # context words sampled per center word
valid_size = 16  # NOTE(review): presumably the number of validation words; confirm against the code that follows
valid_window = 100  # NOTE(review): presumably the id range validation words are drawn from; confirm
valid_examples = 

评论

共有 条评论