中文分词+关键字提取

大小: 2.6MB

文件类型: .zip

金币: 2

下载: 0 次

发布日期: 2023-08-16
语言: 其他
标签: 中文分词

高速下载

资源简介

1：中文分词 2：词频统计 3：罗列出要自动提取的关键字 ---------------------------------------- 具有60 万字/秒的高速处理能力。

资源截图

小图大图

代码片段和文件信息

package com.market.common.paoding;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.LockObtainFailedException;
import org.wltea.analyzer.lucene.IKQueryParser;
/**
 * 自动提取关键字
 * 步骤：
 * 1：使用中文分词技术，得到所有词组
 * 2：根据每个词组在文章中检索出现的次数
 * 3：采用HashMap进行词频统计（注意：HashMap效率不是很高.网上有很多词频统计算法，这里不一一列举）
 * @author zhugf 2010-04-13
 *
 */
public class IKAnalyzer {
	static Map map =new HashMap（）;
	
	public static String getKeyword（String title）{
		String keyword=““;
		try {
			//使用IKQueryParser查询分析器构造Query对象
			Query query1 = IKQueryParser.parse（““ title）;
			String str=query1.toString（）;
			str=str.replace（“ “ “+“）;
			str=str.replace（“（“ ““）;
			str=str.replace（“）“ ““）;
			str=str.replace（“++“ “+“）;
			String[] words=str.split（“\\+“）;/** \\+ **/
			for（int j=0;j				if（words[j].length（）>1）{
					keyword=keyword+words[j]+““;
					//查找字符串key在words中出现的次数的程序
					getCount（titlewords[j]）;
					//System.out.println（words[j]）;
				}
			}
			return keyword;
		} catch （CorruptIndexException e） {
			e.printStackTrace（）;
		} catch （LockObtainFailedException e） {
			e.printStackTrace（）;
		} catch （IOException e） {
			e.printStackTrace（）;
		}
		return keyword;
	}
	
	public static int getCount（String wordsString key）{
        int count=0;
        int temp;
        if（words.length（）>=key.length（））{
            for（int i=0;i<=words.length（）;i++）{
                temp=words.indexOf（key）;
                if（temp>=0）{
                    ++count;
                    words=words.substring（temp+1）;
                }
            }
        }
        map.put（keycount）;
        return count;
    }
	
	public static String sort（）{
		String keys=““;
		List> infoIds = new ArrayList>（map.entrySet（））;    
		//排序前    
		for （int i = 0; i < infoIds.size（）; i++） {    
			String id = infoIds.get（i）.toString（）;    
			//System.out.println（id）;    
		}    
		//排序    
		Collections.sort（infoIds new Comparator>（） {    
		public int compare（Map.Entry o1    
			Map.Entry o2） {    
				return （o2.getValue（） - o1.getValue（））;    
			}    
			}）;    
		//排序后    
		for （int i = 0; i < infoIds.size（）; i++） {    
			//String id = infoIds.get（i）.toString（）;    
			String id = infoIds.get（i）.getKey（）;
			keys=keys+id+““;
			//System.out.println（id）;    
		}  
		return keys;
	}
	
	public static void main（String[] args） {
		//----中文分词----
		String words=getKeyword（“2010年3月25日，团团城市之窗品牌易式形态系列在团团网全面上线，敬请关注！！“）;
		System.out.println（“----------关键字------------“）;
		System.out.println（words

属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2010-04-13 10:47  中文分词+关键字提取\
     文件      125860  2010-04-13 10:45  中文分词+关键字提取\（1）功能使用手册.pdf
     文件      441273  2009-12-06 22:07  中文分词+关键字提取\（2）IKAnalyzer中文分词器V3.2.0使用手册.pdf
     文件         153  2009-09-22 10:54  中文分词+关键字提取\ext_stopword.dic
     文件         479  2009-09-22 11:37  中文分词+关键字提取\IKAnalyzer.cfg.xml
     文件        3460  2010-04-13 10:36  中文分词+关键字提取\IKAnalyzer.java
     文件     1166203  2009-12-08 15:47  中文分词+关键字提取\IKAnalyzer3.2.0Stable.jar
     文件        3508  2010-04-13 10:36  中文分词+关键字提取\IKAnalyzerDemo.java
     文件     1109923  2010-02-21 23:51  中文分词+关键字提取\lucene-core-2.9.2.jar

共有条评论

中文分词+关键字提取

资源简介

资源截图

代码片段和文件信息

评论

相关资源