资源简介
现代信息检索,北京邮电大学,文档、代码。用Lucene实现的信息检索,提供索引,支持PPT,Word,Excel等格式的索引。
代码片段和文件信息
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.FileReader;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.htmlparser.*;
import org.htmlparser.visitors.*;
import org.htmlparser.util.*;
import org.apache.pdfbox.*;
import org.apache.pdfbox.pdmodel.*;
import java.io.*;
import org.apache.pdfbox.util.*;
import org.apache.pdfbox.searchengine.lucene.*;
import org.apache.poi.hwpf.extractor.*;
import org.apache.poi.hslf.usermodel.*;
import org.apache.poi.hslf.*;
import org.apache.poi.hslf.model.*;
import org.apache.poi.hssf.usermodel.*;
import java.util.*;
import org.apache.poi.ss.usermodel.*;
/** A utility for making Lucene Documents from a File. */
public class FileDocument {
/** Makes a document for a File.
The document has three fields:
path
--containing the pathname of the file as a stored
untokenized field;
modified
--containing the last modified date of the file as
a field as created by <a href="lucene.document.DateTools.html">DateTools</a>; and
contents
--containing the full contents of the file as a
Reader field;
*/
public static Document Document(File f) throws java.io.
FileNotFoundException {
// make a new empty document
Document doc = new Document();
String[] encoding = {“UTF-8“ “GBK“ “GB2312“ “UTF-8“ “ISO8859_1“};
// Add the path of the file as a field named “path“. Use a field that is
// indexed (i.e. searchable) but don‘t tokenize the field into words.
doc.add(new Field(“path“ f.getPath() Field.Store.YES
Field.Index.NOT_ANALYZED));
// Add the last modified date of the file a field named “modified“. Use
// a field that is indexed (i.e. searchable) but don‘t tokenize the field
// into words.
doc.add(new Field(“modified“
DateTools.timeToString(f.lastModified()
DateTool
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2011-12-26 14:07 检索课程设计上交内容\
目录 0 2011-12-26 14:08 检索课程设计上交内容\可执行程序\
文件 9272335 2009-12-21 23:28 检索课程设计上交内容\可执行程序\InfRetrW2.2.exe
文件 215 2011-12-26 16:57 检索课程设计上交内容\可执行程序\Retrieve.log
目录 0 2009-12-13 11:35 检索课程设计上交内容\源代码\
文件 9743 2009-12-13 09:41 检索课程设计上交内容\源代码\FileDocument.java
文件 2359 2009-12-13 08:35 检索课程设计上交内容\源代码\IndexFiles.java
文件 4064 2009-12-13 11:27 检索课程设计上交内容\源代码\LuceneProc.java
文件 17678 2009-12-13 11:03 检索课程设计上交内容\源代码\Mainfr
文件 1070 2009-12-30 21:23 检索课程设计上交内容\源代码\MyTable.java
目录 0 2009-12-24 10:15 检索课程设计上交内容\设计与说明文档\
文件 52736 2009-12-13 11:27 检索课程设计上交内容\设计与说明文档\使用说明文档.doc
文件 35328 2009-12-24 10:15 检索课程设计上交内容\设计与说明文档\程序设计说明书.doc
文件 67584 2009-10-20 10:06 检索课程设计上交内容\设计与说明文档\课程设计要求.doc
- 上一篇:EBS财务模块基本功能和常用表
- 下一篇:扩频通信 ppt 调制解调 扩频解扩 同步
评论
共有 条评论