-
大小: 18.08MB | 文件类型: .rar | 金币: 1 | 下载: 0 次 | 发布日期: 2023-08-02
- 语言: 其他
- 标签: 中文地址 地址分词 地址匹配 Levenshtein
资源简介
处理中文地址的分词和匹配。采用混合分词算法进行中文地址分词；在中文地址分词的基础上，采用 Double Levenshtein 算法计算中文地址相似度，并据此进行地址匹配。
代码片段和文件信息
package experiment;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hbase.HbaseConfiguration;
import com.AddressSegment.data.dao.impl.AddressQueryImpl;
import com.AddressSegment.logic.AddressSplitImpl;
import com.AddressSegment.logic.UndefinedWordRecognize;
import com.AddressSegment.metadata.model.CharDictionary;
import com.AddressSegment.metadata.model.WordDictionary;
import com.AddressSegment.tool.dao.impl.DictionaryFileOperationDAOImpl;
import com.AddressSegment.util.Config;
public class CountAddress {
public static Configuration config = null;
public static FileSystem fs = null;
public static DictionaryFileOperationDAOImpl DF = null;
public static WordDictionary wordDict = null;
public static CharDictionary charDict = null;
public static int rowkey = 0;
// public static HTablePool pool = new HTablePool(config 1000);
static {
config = HbaseConfiguration.create();
wordDict = new WordDictionary();
charDict = new CharDictionary();
Configuration conf = new Configuration();
try {
fs = FileSystem.get(URI.create(“hdfs://192.168.31.172:9000“) conf);
} catch (IOException e1) {
e1.printStackTrace();
}
try {
DF = new DictionaryFileOperationDAOImpl(Config.getDefaultDictionaryHDFSURL()
Config.getCharDictionaryHDFSURL() fs);
} catch (URISyntaxException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
DF.putFileToDict(wordDict charDict);
}
public static void ComputeAddressCount(String fileInputPath String fileOutputPath) throws IOException URISyntaxException{
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileInputPath)“GBK“));
String line = ““;
FileOutputStream out=new FileOutputStream(fileOutputPath);
PrintStream p=new PrintStream(out);
while ((line = br.readLine()) != null) {
Integer count = 0;
AddressSplitImpl asi = new AddressSplitImpl();
ArrayList strArray = asi.Split(line fs);
UndefinedWordRecognize uwr = new UndefinedWordRecognize();
ArrayList wordArray1 = uwr.getUndefinedWord(strArray);
AddressQueryImpl aqi = new AddressQueryImpl();
count = aqi.queryAddressCount(wordArray1);
System.out.println(line);
System.out.println(count);
p.println(line+“\t“+count);
}
p.close();
br.close();
}
public static void main(String[] args) throws IOException URISyntaxException {
ComputeAddressCount(“C:/Users/HYFrank/Desktop/Noname1.txt“ “C:/Users/HYFrank/Desktop/countAddress.txt“);
}
}
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 5930 2016-08-09 22:51 src\com\AddressSegment\data\dao\ba
文件 411 2016-08-01 11:36 src\com\AddressSegment\data\dao\declare\AddressQuery.java
文件 238 2016-01-31 15:22 src\com\AddressSegment\data\dao\declare\SegmentInsert.java
文件 5207 2016-08-09 22:24 src\com\AddressSegment\data\dao\impl\AddressQueryImpl.java
文件 1480 2016-02-25 14:56 src\com\AddressSegment\data\dao\impl\SegmentInsertImpl.java
文件 212 2016-01-29 23:28 src\com\AddressSegment\data\dao\ModelRowMapper.java
文件 814 2016-02-25 23:13 src\com\AddressSegment\logic\AddressEncodingService.java
文件 1020 2016-04-05 00:21 src\com\AddressSegment\logic\AddressSplitImpl.java
文件 3924 2016-08-30 00:37 src\com\AddressSegment\logic\AlgorithmDaoImpl.java
文件 3620 2016-02-26 00:36 src\com\AddressSegment\logic\GaodeEncodingServiceInvoker.java
文件 283 2016-02-25 17:00 src\com\AddressSegment\logic\service\AddressEncoding.java
文件 621 2016-04-05 00:20 src\com\AddressSegment\logic\service\AddressSplit.java
文件 1201 2016-04-12 21:53 src\com\AddressSegment\logic\service\AddressTageMaking.java
文件 219 2016-01-25 21:49 src\com\AddressSegment\logic\service\AlgorithmInterface.java
文件 3418 2016-03-05 14:01 src\com\AddressSegment\logic\service\HttpRequestTemplate.java
文件 258 2016-07-24 23:35 src\com\AddressSegment\logic\service\IHttpResponseHandler.java
文件 271 2016-07-24 23:35 src\com\AddressSegment\logic\service\UndefinedWordRecognizeInterface.java
文件 4771 2016-07-22 09:49 src\com\AddressSegment\logic\UndefinedWordRecognize.java
文件 5195 2016-04-12 23:04 src\com\AddressSegment\main\AddressRegexTage.java
文件 4395 2016-04-09 01:07 src\com\AddressSegment\main\AddressSegment.java
文件 5349 2016-07-07 21:17 src\com\AddressSegment\main\AddressSegmentTage.java
文件 6496 2016-07-20 14:06 src\com\AddressSegment\main\AddressSegmentToHba
文件 5976 2016-07-24 23:26 src\com\AddressSegment\main\AddressSegmentToHDFS.java
文件 6047 2016-07-27 00:45 src\com\AddressSegment\main\AddressSegmentToHDFSToHba
文件 2661 2016-04-07 23:20 src\com\AddressSegment\main\WordCount.java
文件 719 2016-01-24 00:56 src\com\AddressSegment\me
文件 1985 2016-01-23 16:59 src\com\AddressSegment\me
文件 836 2016-01-23 16:59 src\com\AddressSegment\me
文件 407 2016-02-25 16:59 src\com\AddressSegment\me
文件 1179 2016-01-30 22:38 src\com\AddressSegment\me
............此处省略226个文件信息
- 上一篇:爱心银行-漂亮的公益活动网站程序
- 下一篇:普朗特《流体力学概论》
评论
共有 条评论