tfidf 采用TFIDF自动对文本进行形式化《WEB数据挖掘与知识发现》试验报告实现程序

大小: 0.06M

文件类型: .rar

金币: 1

下载: 0 次

发布日期: 2021-02-01
标签:

高速下载

资源简介

《WEB数据挖掘与知识发现》试验报告实现程序

【核心代码】

/************************************************
* 《WEB数据挖掘与知识发现》试验报告实现程序     *
* 功能：采用TFIDF自动对文本进行形式化（题目6）  *
*             *
* 时间：2008.2.28                               *
************************************************/

#include<stdio.h>
#include<string.h>
#include<malloc.h>
#include<math.h>

#define FNUM  20   //使用的文件总个数

// One node of a binary tree of index terms (words).
struct Ttree
{
	char data[20];        // the term text, NUL-terminated
	double weight;        // weight of the term; InThread() sets it to (num/max)*log(FNUM/n)
	double num;           // number of times this index term occurs within one document
	double max;           // total number of words in one document (createTtree() maintains it on the root node)
	double n;             // number of documents in which this index term appears
	struct Ttree *lchild; // left child
	struct Ttree *rchild; // right child
};

struct Ttree *rootW=NULL;  // root of the weight-ordered tree that weight() builds and ThreadWeight() prints
struct Ttree *mtree=NULL,*ntree=NULL,*rtree=NULL;  // intermediate pointers used by the weight-sorting function weight() (added March 2)


// Output file written by main() and ThreadWeight().
// NOTE(review): the result of fopen is not checked here; a non-constant
// initializer at file scope requires compiling this file as C++.
FILE *fp=fopen("mm.txt","w");

// Reads the next word (a maximal run of ASCII letters) from fp into word[],
// converted to lower case. At most cap-1 letters are stored; the remaining
// letters of an over-long word are consumed and discarded so that the buffer
// can never overflow. Returns 1 when a word was read, 0 at end of file.
static int readWord(FILE *fp, char *word, size_t cap)
{
	size_t len = 0;
	int ch;                              // int, not char, so EOF is distinguishable from data

	while ((ch = fgetc(fp)) != EOF)
	{
		if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'))
		{
			if (ch <= 'Z') ch = ch + 32; // fold upper case to lower case
			if (len + 1 < cap) word[len++] = (char)ch;
		}
		else if (len > 0)
		{
			break;                       // a non-letter ends the current word
		}
		// non-letters before the first letter of a word are skipped
	}
	word[len] = '\0';
	return len > 0;
}

// Builds a binary search tree holding every distinct word of one document
// together with the number of times it occurs in that document.
//
// root - ignored on entry (kept for interface compatibility); the tree is
//        always built from scratch.
// fp   - document opened for reading; may be NULL.
//
// Returns the root of the new tree, or NULL when fp is NULL or the document
// contains no word. For every node: data is the lower-cased word, num its
// occurrence count and n starts at 1 (the document itself). The root node's
// max holds the total number of words read, duplicates included. A word that
// ends exactly at end of file is counted as well.
Ttree *createTtree(Ttree *root,FILE *fp){
	char word[20];
	double total = 0;                    // running word count of the document

	if (fp == NULL)
	{
		printf("\nCannot open file strike any key exit!");
		return NULL;
	}

	root = NULL;
	while (readWord(fp, word, sizeof word))
	{
		Ttree **slot = &root;            // link where the word lives or must be attached
		Ttree *node;

		total += 1;

		// Walk down the tree until the word is found or an empty link is reached.
		while (*slot != NULL)
		{
			int cmp = strcmp(word, (*slot)->data);
			if (cmp == 0) break;
			slot = (cmp < 0) ? &(*slot)->lchild : &(*slot)->rchild;
		}

		if (*slot != NULL)
		{
			(*slot)->num++;              // word already present: just count it
			continue;
		}

		node = (Ttree*)malloc(sizeof(Ttree));
		if (node == NULL) break;         // out of memory: keep what has been built so far
		strcpy(node->data, word);        // word is at most 19 letters plus NUL
		node->weight = 0;
		node->num = 1;
		node->max = 0;
		node->n = 1;
		node->lchild = NULL;
		node->rchild = NULL;
		*slot = node;
	}

	if (root != NULL) root->max = total; // only the root carries the document length
	return root;
}

/*
 * Binary-search-tree lookup used to count in how many documents a term occurs.
 *
 * Searches the tree rooted at rootx for the term stored in rooty->data.
 * When the term is found, rooty->n (the document count of rooty) is
 * incremented and the matching node of rootx's tree is returned.
 * Returns NULL when the term is not present (or rootx is NULL).
 */
Ttree *SearchBinTtree(Ttree *rootx,Ttree *rooty){
	while (rootx != NULL)
	{
		int cmp = strcmp(rootx->data, rooty->data);

		if (cmp == 0)
		{
			rooty->n++;
			return rootx;
		}
		// A current node greater than the key means the key can only be on the left.
		rootx = (cmp > 0) ? rootx->lchild : rootx->rchild;
	}
	return NULL;
}

/*
 * Counts document occurrences: visits every node of the tree rooted at rooty
 * in order (left subtree, node, right subtree) and looks each one up in the
 * tree rooted at rootx via SearchBinTtree().
 */
void InMidThread(Ttree *rooty,Ttree *rootx){
	Ttree *node;

	// The right subtree is handled by the loop instead of a second recursive call.
	for (node = rooty; node != NULL; node = node->rchild)
	{
		InMidThread(node->lchild, rootx);
		SearchBinTtree(rootx, node);
	}
}

/*
 * Computes the weight of every node in the tree rooted at root, visiting the
 * nodes in order. Each weight is (num / Mroot->max) * log(FNUM / n), where
 * Mroot is the node whose max field supplies the divisor.
 */
void InThread(Ttree *root,Ttree *Mroot){
	Ttree *node;

	// The right subtree is handled by the loop instead of a second recursive call.
	for (node = root; node != NULL; node = node->rchild)
	{
		InThread(node->lchild, Mroot);
		node->weight = (node->num / Mroot->max) * log(FNUM / node->n);
	}
}

// Sorts the terms of one document by weight.
// (Revised March 2 to fix branches that were dropped while building the tree.)
//
// Walks the tree rooted at root in order and inserts a copy of every node
// into a second binary tree whose root is the global rootW. In that tree a
// node with a strictly greater weight goes to the left and all others go to
// the right, so an in-order walk of rootW yields descending weights.
//
// The caller must set the globals rtree and rootW to NULL before starting a
// new document; rtree == NULL marks "no node inserted yet". The globals
// mtree and ntree are used as scratch pointers.
void weight(Ttree *root){
	if(root==NULL) return;
	weight(root->lchild);                   // left subtree first (in-order walk)

	ntree=(Ttree*)malloc(sizeof(Ttree));    // copy of the current node
	if(ntree!=NULL)                         // on allocation failure the node is skipped
	{
		*ntree=*root;                       // copies data, weight, num, max and n
		ntree->lchild=NULL;
		ntree->rchild=NULL;                 // the copy starts out as a leaf

		if(rtree==NULL)
		{
			rootW=ntree;                    // first node becomes the root of the sorted tree
			rtree=ntree;
		}
		else
		{
			mtree=rootW;                    // every insertion starts again from the root
			while(mtree!=NULL)              // find the position for the new node
			{
				if(ntree->weight>mtree->weight)
				{
					if(mtree->lchild==NULL)
					{
						mtree->lchild=ntree;     // free slot on the left: insert
						mtree=NULL;              // and leave the loop
					}
					else
						mtree=mtree->lchild;     // otherwise descend to the left
				}
				else
				{
					if(mtree->rchild==NULL)
					{
						mtree->rchild=ntree;     // free slot on the right: insert
						mtree=NULL;              // and leave the loop
					}
					else
						mtree=mtree->rchild;     // otherwise descend to the right
				}
			}
		}
	}

	weight(root->rchild);                   // then the right subtree
}


// Reports whether a term belongs to the vocabulary file text\vo.txt
// (added March 2).
//
// The file is read line by line; within each line only ASCII letters are
// kept and folded to lower case, every other character (spaces, digits,
// '\r', ...) is ignored, and at most 19 letters are retained. The result is
// compared with data.
//
// data - NUL-terminated term to look for.
//
// Returns true when some line matches data, false otherwise. When the
// vocabulary file cannot be opened, true is returned so that the weights of
// all terms are output.
bool in(char data[20])
{
	char temp[20];
	int k=0;
	int ch;                              // int, not char, so EOF is distinguishable from data
	bool found=false;
	FILE *fp=fopen("text\\vo.txt","r");

	if(fp==NULL)
	{
		return true;                     // no vocabulary file: accept every term
	}

	do
	{
		ch=fgetc(fp);                    // always advance, whatever the character is
		if((ch>='a'&&ch<='z')||(ch>='A'&&ch<='Z'))
		{
			if(ch<='Z') ch=ch+32;        // fold upper case to lower case
			if(k<(int)sizeof(temp)-1)    // drop letters that would overflow temp
			{
				temp[k]=(char)ch;
				k++;
			}
		}
		else if(ch=='\n'||ch==EOF)
		{
			temp[k]='\0';                // end of a line: compare what was collected
			k=0;
			if(strcmp(temp,data)==0)
			{
				found=true;
				break;
			}
		}
	} while(ch!=EOF);

	fclose(fp);
	return found;
}

// Writes the weights to the output file: walks the tree rooted at root in
// order and, for every term accepted by in(), prints one line with the term,
// its weight, its occurrence count and its document count to the global fp.
void ThreadWeight(Ttree *root){
	Ttree *node;

	// The right subtree is handled by the loop instead of a second recursive call.
	for(node=root;node!=NULL;node=node->rchild)
	{
		ThreadWeight(node->lchild);
		if(!in(node->data))
			continue;
		fprintf(fp,"%30s\t%10.6f\t%6d\t%6d\n",node->data,node->weight,(int)(node->num),(int)(node->n));
	}
}

// Program entry point.
//
// Run with any command-line argument: prints a usage note and returns 0.
// Run without arguments: reads the FNUM documents text\P00.txt, text\P01.txt,
// ..., builds one word tree per document, counts for every term in how many
// documents it occurs, computes the weights and writes the terms of each
// document, ordered by descending weight, to mm.txt.
//
// Returns 1 after a normal run (kept as in the original program), -1 when
// mm.txt cannot be written or a structured exception is caught.
// Documents that cannot be opened or contain no word are skipped.
int main(int argc, char* argv[])
{	
	Ttree *root[ FNUM ];
	int i,j;
	
	__try
	{
		if (argc>1)
		{
			printf("Help!\n\nNote:Please put txts into the folder \"text\",\nand edit terms in text\\vo.txt.\nRun InformationRetrieval.exe without parameter,you will get the result in mm.txt\nIf you deltet vo.txt,the result will be all terms.\n");
			return 0;
		}

		if (fp==NULL)
		{
			// The global output file could not be opened; nothing can be written.
			printf("Cannot open mm.txt for writing!\n");
			return -1;
		}
		
		// Read the documents and count term frequencies.
		for(i=0;i<FNUM;i++)
		{	
			FILE *fr;	
			char rFileName[64];
			root[i]=NULL;
			sprintf(rFileName,"text\\P%02d.txt",i);
			printf("%s\n",rFileName);
			fr=fopen(rFileName,"r");
			root[i]=createTtree(root[i],fr);
			if(fr!=NULL) fclose(fr);        // fclose(NULL) is undefined behaviour
		}
		
		// Compare every document with every other one to count in how many
		// documents each term appears.
		for(i=0;i<FNUM;i++)
		{
			for(j=0;j<FNUM;j++)
			{
				int next = (j+1)%FNUM;
				if(next==i)	continue;       // a document is not compared with itself
				InMidThread(root[i],root[next]);
			}
		}
		fprintf(fp,"%s\n\n","注:词出现次数--词在该文档中出现次数， 文档频率--词在几篇文档中出现。");
		fprintf(fp,"%17s\t%6s\t%10s\t%6s%9s\n","总词数","词汇","权值","词出现次数","文档频率");
		// Compute the weights, sort them and write them out.
		for(i=0;i<FNUM;i++)
		{
			if(root[i]==NULL) continue;     // missing or empty document
			rtree=rootW=NULL;               // start a fresh weight-ordered tree
			InThread(root[i],root[i]);
			fprintf(fp,"第%d篇文档%6d\n",i,(int)(root[i]->max));
			weight(root[i]);
			ThreadWeight(rootW);
		}
		
		fclose(fp);                         // the input files were already closed above
	}
	__except(1)
	{
		printf("Error occurring in the course of geting mail!\nProgram exit exceptionally!\n");
		return -1;
	}
	return 1;
}

资源截图

小图大图

代码片段和文件信息

/************************************************
* 《WEB数据挖掘与知识发现》试验报告实现程序     *
* 功能：采用TFIDF自动对文本进行形式化（题目6）  *
*             *
* 时间：2008.2.28                               *
************************************************/

#include<stdio.h>
#include<string.h>
#include<malloc.h>
#include<math.h>

#define FNUM  20   //使用的文件总个数

// One node of a binary tree of index terms (words).
struct Ttree
{
	char data[20];        // the term text
	double weight;        // weight of the term
	double num;           // number of times this index term occurs within one document
	double max;           // total number of words in one document
	double n;             // number of documents in which this index term appears
	struct Ttree *lchild; // left child
	struct Ttree *rchild; // right child
};

struct Ttree *rootW=NULL;
struct Ttree *mtree=NULL,*ntree=NULL,*rtree=NULL;  //定义weight权值排序函数中间变量****3月2日增加


FILE *fp=fopen("mm.txt","w");

//创建二叉树用来存放单词，以及该词在文档中出现的次数
Ttree *createTtree（Ttree *rootFILE *fp）{
	in

属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----

     文件       8225  2008-06-25 11:29  tfidfsrc\InformationRetrieval.cpp

     文件       3569  2008-03-02 15:46  tfidfsrc\InformationRetrieval.dsp

     文件        563  2008-03-02 15:48  tfidfsrc\InformationRetrieval.dsw

     文件      53248  2008-03-04 15:42  tfidfsrc\InformationRetrieval.exe

     文件       2531  2008-06-25 11:30  tfidfsrc\mm.txt

     文件       4221  2008-02-27 23:39  tfidfsrc\text\P00.txt

     文件       4546  2008-02-27 23:40  tfidfsrc\text\P01.txt

     文件       3685  2008-02-28 14:49  tfidfsrc\text\P02.txt

     文件       2465  2008-02-28 14:50  tfidfsrc\text\P03.txt

     文件       3479  2008-02-28 14:51  tfidfsrc\text\P04.txt

     文件       3973  2008-02-28 14:55  tfidfsrc\text\P05.txt

     文件       1898  2008-02-28 14:56  tfidfsrc\text\P06.txt

     文件       4461  2008-02-28 14:56  tfidfsrc\text\P07.txt

     文件       3412  2008-02-28 14:57  tfidfsrc\text\P08.txt

     文件       3942  2008-02-28 14:58  tfidfsrc\text\P09.txt

     文件       3487  2008-02-28 14:59  tfidfsrc\text\P10.txt

     文件       4055  2008-02-28 15:00  tfidfsrc\text\P11.txt

     文件       4666  2008-02-28 15:01  tfidfsrc\text\P12.txt

     文件       4986  2008-02-28 15:01  tfidfsrc\text\P13.txt

     文件       4903  2008-02-28 15:02  tfidfsrc\text\P14.txt

     文件       4020  2008-02-28 15:03  tfidfsrc\text\P15.txt

     文件       3599  2008-02-28 15:04  tfidfsrc\text\P16.txt

     文件       4565  2008-02-28 15:04  tfidfsrc\text\P17.txt

     文件       4915  2008-02-28 15:05  tfidfsrc\text\P18.txt

     文件       3785  2008-02-28 15:07  tfidfsrc\text\P19.txt

     文件       3300  2008-02-28 15:09  tfidfsrc\text\P20.txt

     文件         24  2008-03-03 20:04  tfidfsrc\text\vo.txt

     目录          0  2008-03-04 14:49  tfidfsrc\text

     目录          0  2008-06-25 11:30  tfidfsrc

----------- ---------  ---------- -----  ----

............此处省略2个文件信息

上一篇：管道使用demo RunDosCommand
下一篇：智商测试（C++坑人版）

共有条评论

tfidf 采用TFIDF自动对文本进行形式化 《WEB数据挖掘与知识发现》试验报告实现程序

资源简介

资源截图

代码片段和文件信息

评论

相关资源

tfidf 采用TFIDF自动对文本进行形式化《WEB数据挖掘与知识发现》试验报告实现程序