Java实现的中文词性标注算法

大小: 662KB

文件类型: .rar

金币: 2

下载: 0 次

发布日期: 2021-06-06
语言: Java
标签: 中文 词性标注 java

高速下载

资源简介

用Java语言实现的中文词性标注程序，已在Eclipse上编译通过，可以运行。欢迎下载并提出意见。

资源截图

小图 | 大图

代码片段和文件信息

import java.util.*;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.math.*;

public class Viterbi
{
	public static void main（String[] args）
	{
		
		//----------------------------------------------------------------------------------
		//统计出训练样本中词性种类及其频率
	  String content=““;
	  BufferedReader reader=null;
	  try{  //读取199801train.txt文本中的内容，并保存在content的字符流中
		    reader=new BufferedReader（new FileReader（“c:/199801train.txt“））;
		    String line;
		    while（（line=reader.readLine（））!=null） content+=line;
		   }
	  catch（IOException e）
	  {
		   e.printStackTrace（）;
	  }
	  finally
	  {
		   if（reader!=null）
       {
    	   try{reader.close（）;}   	   
    	   catch（IOException e）{}
       }
	  }
	  
	  String[] text;  //text[]用于存储训练样本中的词语
	  text=content.split（“（/[a-z]*\\s{0}）|（][a-z]*\\s{1}）“）; //去除词性标注
	  //for（String wd:text）
	    //System.out.println（wd）;
	   
	    
	  String[] temp;  //temp[]数组用于存储单个词的词性标注符号
    temp=content.split（“[0-9|-]*/|\\s{1}[^a-z]*|][a-z]“）; //仅保留词性标注符号。
    String[] temp1;
    temp1=new String[temp.length-1];//去除temp[0]为空的情况
    for（int i=0;i      temp1[i]=temp[i+1];	
    //for（String wd:temp1）
      //System.out.print（wd+“  “）;
      
    String[] temp2;  //temp2[]数组用于存储每两个词的词性标注符号
    temp2=new String[temp1.length-1];
    for（int i=0;i      temp2[i]=temp1[i]+‘‘+temp1[i+1];    
    //for（String wd:temp2）
      //System.out.println（wd）;
      
    String[] word_pos;
    word_pos=new String[text.length];
    for（int i=0;i      word_pos[i]=text[i]+‘‘+temp1[i];
    //for（String wd:word_pos）
      //System.out.println（wd）;
         
      
    Hashtable hash1=new Hashtable（）;  //创建hash1，存储单个词的词性及其频率
    for（String wd:temp1）
    {
    	if（hash1.containsKey（wd））
    	  hash1.put（wdhash1.get（wd）.hashCode（）+1）;
    	else
    	  hash1.put（wd1）;    	      	  
    }
    int sp=hash1.size（）;  //统计词性个数
      //System.out.println（hash1）;
    
    Hashtable hash2=new Hashtable（）;  //创建hash2，存储每两个词的词性及其频率
    for（String wd:temp2）
    {
    	if（hash2.containsKey（wd））
    	  hash2.put（wdhash2.get（wd）.hashCode（）+1）;
    	else
    	  hash2.put（wd1）;
    }
    //System.out.println（hash2）;    
   
   Hashtable hash3=new Hashtable（）;  //创建hash3存储词语、词性和词频
   for（String wd:word_pos）
   {
   	if（hash3.containsKey（wd））
   	  hash3.put（wdhash3.get（wd）.hashCode（）+1）;
   	else
   	  hash3.put（wd1）;
   }
   //System.out.println（hash3）;
   
    String[] table_pos;  //table_pos[]用于存储所有不同的词性符号
    table_pos=new String[sp];
    Enumeration key=hash1.keys（）;    
    for（int i=0;i    {       
      String str=（String）key.nextElement（）;
      table_pos[i]=str;
    }
    //for（String wd:table_pos）
       //System.out.println（wd）;
          
   
   //-------------------------------------------------

属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----

     文件      22374  2009-10-27 16:57  Wordpos\199801test.txt

     文件    2437276  2009-10-27 16:57  Wordpos\199801train.txt

     文件      22320  2009-12-18 16:59  Wordpos\result.txt

     文件       8894  2009-12-18 16:55  Wordpos\Viterbi.java

     文件      16953  2010-01-06 13:46  Wordpos\程序说明.docx

     目录          0  2010-01-06 13:46  Wordpos

----------- ---------  ---------- -----  ----

              2507817                    6

上一篇：剑指offer（java版）.pdf
下一篇：Java AES文件和文本加解密

共有条评论

Java实现的中文词性标注算法

资源简介

资源截图

代码片段和文件信息

评论

相关资源