资源简介
基于 TF-IDF 的文档集关键词提取
已经含有测试文档集
可以替换成任意需要的文档集
可以自己提供字典
代码片段和文件信息
///////////////////////////////////////////////////////////////////
// File :Dir txt Input
// Author :ShuanHolmes
// Date :2015.4.10
// Modifier :...
// Modify Date :...
// Description :statics_Dir.cpp
///////////////////////////////////////////////////////////////////
#include "Statics.h"

#include <cstdint>
// Shared state used by the functions in this file. Everything declared
// `extern` is defined in another translation unit; DataOut is defined here.
extern map< string, int > Dic;          // string -> int table; presumably the word dictionary (verify); cleared by DatabaseConstruction
extern list< string > SinStatics;       // words of the batch in progress; drained by WordFrequency
extern multiset< string > SumStatics;   // one entry per (batch, word); its per-word count feeds DatabaseConstruction
extern multiset< string > Fileidf;      // not referenced in the visible part of this file
extern set< Word > Database;            // weighted words produced by DatabaseConstruction
list< WordIDF > DataOut;                // not referenced in the visible part of this file
void getJustCurrentFile( string path vector& files)
{ // return file iter
long hFile = 0; // file info
struct _finddata_t fileinfo;
string p;
if((hFile = _findfirst(p.assign(path).append(“\\*“).c_str()&fileinfo)) != -1)
{
do
{
if((fileinfo.attrib & _A_SUBDIR));
else
files.push_back(fileinfo.name);
}while(_findnext(hFile &fileinfo) == 0);
_findclose(hFile);
}
}
/// Moves the words collected in SinStatics into SumStatics, inserting each
/// distinct word exactly once, and leaves SinStatics empty.
///
/// DatabaseConstruction later divides its N argument by
/// SumStatics.count(word), so a word must contribute at most one entry per
/// call here.
void WordFrequency( void )
{
    // list::unique() only collapses *adjacent* equal elements, so the list
    // has to be sorted first for the de-duplication to be complete.
    SinStatics.sort();
    SinStatics.unique();

    SumStatics.insert(SinStatics.begin(), SinStatics.end());
    SinStatics.clear(); // ready for the next batch of words
}
void DatabaseConstruction( float N ) // the file group
{
multiset< string >::iterator it;
Word temp;
for(it = SumStatics.begin(); it != SumStatics.end(); it++ )
{
temp.wordfrequency = fabs(log(N/(float)SumStatics.count(*it))/log(2));
temp.word = *it;
if(Database.find(temp)==Database.end())
{
Database.insert(temp);
}
}
SinStatics.clear();
SumStatics.clear();
Dic.clear();
}
void TfidfFileInput(char *filename) // segment the sentence store the real words
{
ifstream testfile(filename);
string testsentence;
string testword;
if (!testfile)
cerr << “Fail to open “ << filename << endl;
else
cout << “Succeed to open “ << filename << endl;
cout << “Please wait “<< filename << “ segmenting the sentences in test file!“ << endl;
while(!testfile.eof())
{
getline(testfiletestsentence‘\n‘);
string result_temp=““;
int result_len = 0;
string sentence_temp=testsentence;
int cur_sen_length=testsentence.length();
int len1len2;
while(sentence_temp!=““)
{
len1 = sentence_temp.length();
len2 = sentence_temp.length();
if(len2 > MaxWordLength) // MaxLength
len2 = MaxWordLength;
testword = sentence_temp.substr(len1-len2);
bool isw = TFidfWordCheck( testword );
while(len2 > 2 && isw == false)
{
len2 = len2-2; // 2 Byte 1 word
testword = sentence_temp.substr(len1-len2);
isw = TFidfWordCheck( testword );
}
if(result_temp == ““)
result_temp=testword+result_temp; // continue
else
result_temp=testword+“ “+result_temp; // cut
sentence_temp=sentence_temp.substr(0len1-len2); // next sentence
}
}
testfile.close();
}
bool TFidfWordCheck(string test_word) // whether t
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2015-04-10 20:25 Tfidf_Calculate\
文件 4456 2015-05-28 00:17 Tfidf_Calculate\DirInput.cpp
文件 940 2015-05-28 00:18 Tfidf_Calculate\main.cpp
目录 0 2015-04-11 11:46 Tfidf_Calculate\mingw5\
文件 3268 2003-07-21 19:40 Tfidf_Calculate\mingw5\(1).txt
文件 5626 2015-04-15 22:41 Tfidf_Calculate\mingw5\(1)Out.txt
文件 998 2003-07-21 19:40 Tfidf_Calculate\mingw5\(10).txt
文件 1698 2015-04-15 22:41 Tfidf_Calculate\mingw5\(10)Out.txt
文件 1341 2003-07-21 19:40 Tfidf_Calculate\mingw5\(100).txt
文件 2283 2015-04-15 22:41 Tfidf_Calculate\mingw5\(100)Out.txt
文件 699 2003-07-21 19:40 Tfidf_Calculate\mingw5\(101).txt
文件 1241 2015-04-15 22:41 Tfidf_Calculate\mingw5\(101)Out.txt
文件 963 2003-07-21 19:40 Tfidf_Calculate\mingw5\(102).txt
文件 1651 2015-04-15 22:41 Tfidf_Calculate\mingw5\(102)Out.txt
文件 3045 2003-07-21 19:40 Tfidf_Calculate\mingw5\(103).txt
文件 5183 2015-04-15 22:41 Tfidf_Calculate\mingw5\(103)Out.txt
文件 785 2003-07-21 19:40 Tfidf_Calculate\mingw5\(104).txt
文件 1339 2015-04-15 22:41 Tfidf_Calculate\mingw5\(104)Out.txt
文件 814 2003-07-21 19:40 Tfidf_Calculate\mingw5\(105).txt
文件 1442 2015-04-15 22:41 Tfidf_Calculate\mingw5\(105)Out.txt
文件 1190 2003-07-21 19:40 Tfidf_Calculate\mingw5\(106).txt
文件 2168 2015-04-15 22:41 Tfidf_Calculate\mingw5\(106)Out.txt
文件 1265 2003-07-21 19:40 Tfidf_Calculate\mingw5\(107).txt
文件 2209 2015-04-15 22:41 Tfidf_Calculate\mingw5\(107)Out.txt
文件 1157 2003-07-21 19:40 Tfidf_Calculate\mingw5\(108).txt
文件 2001 2015-04-15 22:41 Tfidf_Calculate\mingw5\(108)Out.txt
文件 1195 2003-07-21 19:40 Tfidf_Calculate\mingw5\(109).txt
文件 2011 2015-04-15 22:41 Tfidf_Calculate\mingw5\(109)Out.txt
文件 788 2003-07-21 19:40 Tfidf_Calculate\mingw5\(11).txt
文件 1400 2015-04-15 22:41 Tfidf_Calculate\mingw5\(11)Out.txt
文件 1000 2003-07-21 19:40 Tfidf_Calculate\mingw5\(110).txt
............此处省略308个文件信息
相关资源
- STM32F407+ov5640摄像头在TFTLCD实时显示图
- cscms_v3.5_utf8
- FATFS_ff13a
- dotNetFx45LP_Full_x86_x64zh-Hans.exe
- stm32 多串口同步工作 DMA printf打印字符
- Stm32驱动SDCard移植FATFS文件系统
- IdHTTPServer+utf8转换+json解析
- STM 图片显示TFT 正点原子
- DirectFB-1.4.3.tar.gz
- SmartFoxServer中文教程及API
- GlobalPlatform卡片规范2.2 中文版
- CTFcrackTools-V2.2
- Hadoop Security Protecting Your Big Data Platf
- 轻松搭建Postfix 邮件服务器系统
- stm32f429+FATFS+SD卡项目模版
- SDL_ttf-2.0.10
- MiniGUI库文件之带TTF- libminigui-1.6.10-t
- Tensorflow GNN实战.zip
- FruitFinal.rar
- discuz模板_迪恩moko美空加强版_商业版
- stm32f103c8t6 驱动ili9341 2.8寸TFT LCD液晶显
- FastReport报告模板设计器
- minigui3.0.12全面使用使用ttf字库完美支
- ECShop_V2.7.3_UTF8 完整版
- Google Cloud Platform for Architects Design an
- 1102-0x20000_hg255d-squashfs-tftp.checksum2.bi
- 3DSMAX导出GLTF插件内涵多个版本,安装
- simhei.ttf times.ttf
- The standard for portfolio management 4th edit
- ALIENTEK MINISTM32 实验10 TFTLCD显示实验
评论
共有 条评论