TensorFlow实战中实现word2vec代码含中文注释

大小:

文件类型: .py

金币: 1

下载: 0 次

发布日期: 2021-06-03
语言: Python
标签: word2vec 代码

高速下载

资源简介

TensorFlow实战中实现word2vec代码（含中文注释）

资源截图

小图大图

代码片段和文件信息

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

#%%
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import collections
import math
import os
import random
import zipfile

import numpy as np
import urllib
import tensorflow as tf

# Step 1: Download the data.
# 步骤一: 下载数据
# Base URL that dataset file names are appended to when downloading.
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present and make sure it's the right size.

  Args:
    filename: Name of the file, used both as the local path and as the
      suffix appended to the module-level `url` when downloading.
    expected_bytes: Size in bytes the local file must have.

  Returns:
    The local file name.

  Raises:
    Exception: If the size of the local file differs from `expected_bytes`.
  """
  if not os.path.exists(filename):
    # A bare `import urllib` does not guarantee that the `request` submodule
    # is loaded, so import it explicitly before using it.
    import urllib.request
    filename, _ = urllib.request.urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', filename)
  else:
    # Show the actual size to help diagnose a truncated or wrong download.
    print(statinfo.st_size)
    raise Exception(
        'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

# Fetch the text8 corpus archive (or reuse a local copy) and check its size.
filename = maybe_download('text8.zip', 31344016)

# Read the data into a list of strings.
# 把数据读取进一个字符串的列表
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words.

  Args:
    filename: Path to the zip archive.

  Returns:
    The whitespace-separated tokens of the archive's first member, as a
    list of str.
  """
  with zipfile.ZipFile(filename) as f:
    # ZipFile.read returns bytes; decode as UTF-8 before tokenizing.
    raw = f.read(f.namelist()[0])
  data = raw.decode('utf-8').split()
  return data

# Load the whole corpus into memory as a list of word strings.
words = read_data(filename)
print('Data size', len(words))

# Step 2: Build the dictionary and replace rare words with UNK token.
# 步骤二: 构建一个词典，并把稀有词语用‘UNK‘代替
# Number of distinct ids in the vocabulary, including the 'UNK' entry.
vocabulary_size = 50000

def build_dataset(words):
  """Map words to integer ids, sending out-of-vocabulary words to 'UNK'.

  Args:
    words: List of word strings.

  Returns:
    A tuple (data, count, dictionary, reverse_dictionary) where
      data is the list of ids, one per input word;
      count is a list of word/frequency pairs whose first entry is
        ['UNK', <number of out-of-vocabulary words>];
      dictionary maps word -> id;
      reverse_dictionary maps id -> word.
  """
  # Word/frequency pairs: 'UNK' first, then the (vocabulary_size - 1) most
  # common words. The -1 is a placeholder that is filled in below.
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  # word -> id; ids are assigned in order of appearance in `count`.
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  # Convert every word to its id and count the out-of-vocabulary ones.
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count += 1
    data.append(index)
  # Record the real frequency of 'UNK'.
  count[0][1] = unk_count
  # id -> word.
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reverse_dictionary

# Encode the corpus as ids and build the vocabulary lookup tables.
data, count, dictionary, reverse_dictionary = build_dataset(words)
#print（len（count

上一篇：Python人工智能AI深度学习全套课程.txt
下一篇：k匿名隐私保护算法python版

共有条评论

TensorFlow实战中实现word2vec代码含中文注释

资源简介

资源截图

代码片段和文件信息

评论

相关资源