资源简介
Doc2vec 是一组用来产生词向量的相关模型。这些模型是浅层的双层神经网络,用来训练以重新建构语言学之词文本。
代码片段和文件信息
package com.ansj.vec;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import love.cq.util.MapCount;
import com.ansj.vec.domain.HiddenNeuron;
import com.ansj.vec.domain.Neuron;
import com.ansj.vec.domain.WordNeuron;
import com.ansj.vec.util.Haffman;
public class Learn {
private Map wordMap = new HashMap<>();
/**
* 训练多少个特征
*/
private int layerSize = 200;
/**
* 上下文窗口大小
*/
private int window = 5;
private double sample = 1e-3;
private double alpha = 0.025;
private double startingAlpha = alpha;
public int EXP_TABLE_SIZE = 1000;
private Boolean isCbow = false;
private double[] expTable = new double[EXP_TABLE_SIZE];
private int trainWordsCount = 0;
private int MAX_EXP = 6;
private int freqThresold = 5;
public Learn(Boolean isCbow Integer layerSize Integer window
Double alpha Double sample) {
createExpTable();
if (isCbow != null) {
this.isCbow = isCbow;
}
if (layerSize != null)
this.layerSize = layerSize;
if (window != null)
this.window = window;
if (alpha != null)
this.alpha = alpha;
if (sample != null)
this.sample = sample;
}
public Learn() throws IOException {
createExpTable();
}
/**
* trainModel
*
* @throws IOException
*/
private void trainModel(File file) throws IOException {
try (BufferedReader br = new BufferedReader(new InputStreamReader(
new FileInputStream(file)))) {
String temp = null;
long nextRandom = 5;
int wordCount = 0;
int lastWordCount = 0;
int wordCountActual = 0;
while ((temp = br.readLine()) != null) {
if (wordCount - lastWordCount > 10000) {
System.out.println(“alpha:“
+ alpha
+ “\tProgress: “
+ (int) (wordCountActual
/ (double) (trainWordsCount + 1) * 100)
+ “%“);
wordCountActual += wordCount - lastWordCount;
lastWordCount = wordCount;
alpha = startingAlpha
* (1 - wordCountActual
/ (double) (trainWordsCount + 1));
if (alpha < startingAlpha * 0.0001) {
alpha = startingAlpha * 0.0001;
}
}
String[] strs = temp.split(“ “);
wordCount += strs.length;
List sentence = new ArrayList();
for (int i = 0; i < strs.length; i++) {
Neuron entry = wordMap.get(strs[i]);
if (entry == null) {
continue;
}
// The subsampling randomly discards frequent words while
// keeping the ranking same
if (sample > 0) {
double ran = (Math.sqrt(entry.freq
/ (sample * trainWordsCount)) + 1)
* (sample * trainWordsCount) / entry.freq;
nextRandom = nextRandom * 25214903917L + 11;
if (ran < (nextRandom & 0xFFFF) / (d
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2015-07-23 07:26 doc2vec_java-master\
文件 471 2015-07-23 07:26 doc2vec_java-master\.classpath
文件 371 2015-07-23 07:26 doc2vec_java-master\.project
目录 0 2015-07-23 07:26 doc2vec_java-master\.settings\
文件 658 2015-07-23 07:26 doc2vec_java-master\.settings\org.eclipse.jdt.core.prefs
文件 647 2015-07-23 07:26 doc2vec_java-master\README.md
目录 0 2015-07-23 07:26 doc2vec_java-master\bin\
目录 0 2015-07-23 07:26 doc2vec_java-master\bin\com\
目录 0 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\
目录 0 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\
文件 11613 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\Learn.class
文件 11686 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\LearnDocVec.class
文件 9784 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\Word2VEC.class
目录 0 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\domain\
文件 389 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\domain\HiddenNeuron.class
文件 718 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\domain\Neuron.class
文件 1207 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\domain\WordEntry.class
文件 1610 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\domain\WordNeuron.class
目录 0 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\util\
文件 1475 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\util\Haffman.class
文件 2818 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\util\ModelFile.class
文件 1050 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\util\ReadWriteFile.class
文件 1516 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\util\WordKmeans$Classes$1.class
文件 3258 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\util\WordKmeans$Classes.class
文件 4510 2015-07-23 07:26 doc2vec_java-master\bin\com\ansj\vec\util\WordKmeans.class
目录 0 2015-07-23 07:26 doc2vec_java-master\bin\test\
文件 2937 2015-07-23 07:26 doc2vec_java-master\bin\test\Doc2VecTest.class
文件 1484 2015-07-23 07:26 doc2vec_java-master\bin\test\Word2VecTest.class
目录 0 2015-07-23 07:26 doc2vec_java-master\file\
文件 7680759 2015-07-23 07:26 doc2vec_java-master\file\amazon_docs.txt
文件 16492176 2015-07-23 07:26 doc2vec_java-master\file\clinical_doc_200_java.vec
............此处省略27个文件信息
- 上一篇:简单的即时便签
- 下一篇:javaweb课程大作业——教务管理系统
评论
共有 条评论