• 大小: 39.11MB
    文件类型: .zip
    金币: 1
    下载: 0 次
    发布日期: 2023-07-24
  • 语言: Java
  • 标签: DOC2VEC  

资源简介

Doc2vec 是一组用来产生词向量的相关模型。这些模型是浅层的双层神经网络,经过训练后用于重新建构词语的语言学上下文。

资源截图

代码片段和文件信息

package com.ansj.vec;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import love.cq.util.MapCount;

import com.ansj.vec.domain.HiddenNeuron;
import com.ansj.vec.domain.Neuron;
import com.ansj.vec.domain.WordNeuron;
import com.ansj.vec.util.Haffman;

public class Learn {

private Map wordMap = new HashMap<>();

/**
 * 训练多少个特征
 */
private int layerSize = 200;

/**
 * 上下文窗口大小
 */
private int window = 5;

private double sample = 1e-3;
private double alpha = 0.025;
private double startingAlpha = alpha;

public int EXP_TABLE_SIZE = 1000;

private Boolean isCbow = false;

private double[] expTable = new double[EXP_TABLE_SIZE];

private int trainWordsCount = 0;

private int MAX_EXP = 6;

private int freqThresold = 5;

/**
 * Creates a trainer, replacing each built-in default with the matching
 * argument whenever that argument is non-null.
 *
 * @param isCbow    value for the isCbow flag, or {@code null} to keep the default
 * @param layerSize number of features to train, or {@code null} to keep the default
 * @param window    context window size, or {@code null} to keep the default
 * @param alpha     initial alpha value, or {@code null} to keep the default
 * @param sample    sub-sampling parameter, or {@code null} to keep the default
 */
public Learn(Boolean isCbow, Integer layerSize, Integer window,
        Double alpha, Double sample) {
    createExpTable();
    if (isCbow != null) {
        this.isCbow = isCbow;
    }
    if (layerSize != null) {
        this.layerSize = layerSize;
    }
    if (window != null) {
        this.window = window;
    }
    if (alpha != null) {
        this.alpha = alpha;
        // startingAlpha was captured from the default alpha at field
        // initialisation; keep it in sync so the decay schedule starts
        // from the caller's value instead of the default.
        this.startingAlpha = alpha;
    }
    if (sample != null) {
        this.sample = sample;
    }
}

/**
 * Creates a trainer that keeps every default setting and only builds the
 * exp table.
 *
 * @throws IOException declared by the signature
 */
public Learn() throws IOException {
    createExpTable();
}

/**
 * trainModel
 * 
 * @throws IOException
 */
private void trainModel(File file) throws IOException {
try (BufferedReader br = new BufferedReader(new InputStreamReader(
new FileInputStream(file)))) {
String temp = null;
long nextRandom = 5;
int wordCount = 0;
int lastWordCount = 0;
int wordCountActual = 0;
while ((temp = br.readLine()) != null) {
if (wordCount - lastWordCount > 10000) {
System.out.println(“alpha:“
+ alpha
+ “\tProgress: “
+ (int) (wordCountActual
/ (double) (trainWordsCount + 1) * 100)
+ “%“);
wordCountActual += wordCount - lastWordCount;
lastWordCount = wordCount;
alpha = startingAlpha
* (1 - wordCountActual
/ (double) (trainWordsCount + 1));
if (alpha < startingAlpha * 0.0001) {
alpha = startingAlpha * 0.0001;
}
}
String[] strs = temp.split(“ “);
wordCount += strs.length;
List sentence = new ArrayList();
for (int i = 0; i < strs.length; i++) {
Neuron entry = wordMap.get(strs[i]);
if (entry == null) {
continue;
}
// The subsampling randomly discards frequent words while
// keeping the ranking same
if (sample > 0) {
double ran = (Math.sqrt(entry.freq
/ (sample * trainWordsCount)) + 1)
* (sample * trainWordsCount) / entry.freq;
nextRandom = nextRandom * 25214903917L + 11;
if (ran < (nextRandom & 0xFFFF) / (d

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2015-07-23 07:26  doc2vec_java-master\
     文件         471  2015-07-23 07:26  doc2vec_java-master\.classpath
     文件         371  2015-07-23 07:26  doc2vec_java-master\.project
     目录           0  2015-07-23 07:26  doc2vec_java-master\.settings\
     文件         658  2015-07-23 07:26  doc2vec_java-master\.settings\org.eclipse.jdt.core.prefs
     文件         647  2015-07-23 07:26  doc2vec_java-master\README.md
     目录           0  2015-07-23 07:26  doc2vec_java-master\bin\
     目录           0  2015-07-23 07:26  doc2vec_java-master\bin\com\
     目录           0  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\
     目录           0  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\
     文件       11613  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\Learn.class
     文件       11686  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\LearnDocVec.class
     文件        9784  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\Word2VEC.class
     目录           0  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\domain\
     文件         389  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\domain\HiddenNeuron.class
     文件         718  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\domain\Neuron.class
     文件        1207  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\domain\WordEntry.class
     文件        1610  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\domain\WordNeuron.class
     目录           0  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\util\
     文件        1475  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\util\Haffman.class
     文件        2818  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\util\ModelFile.class
     文件        1050  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\util\ReadWriteFile.class
     文件        1516  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\util\WordKmeans$Classes$1.class
     文件        3258  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\util\WordKmeans$Classes.class
     文件        4510  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\util\WordKmeans.class
     目录           0  2015-07-23 07:26  doc2vec_java-master\bin\test\
     文件        2937  2015-07-23 07:26  doc2vec_java-master\bin\test\Doc2VecTest.class
     文件        1484  2015-07-23 07:26  doc2vec_java-master\bin\test\Word2VecTest.class
     目录           0  2015-07-23 07:26  doc2vec_java-master\file\
     文件     7680759  2015-07-23 07:26  doc2vec_java-master\file\amazon_docs.txt
     文件    16492176  2015-07-23 07:26  doc2vec_java-master\file\clinical_doc_200_java.vec
............此处省略27个文件信息

评论

共有 条评论

相关资源