资源简介
可以直接提取 Word 文档里的图片信息，代码带有注释，一看就明白。
代码片段和文件信息
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.*;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
/**
* 类 MsWordExtractor用来提取Microsoft Word 里面的文字和图片
* 注意提取图片后,可以把图片放在由用户指定的路径下面
*
* @author Zhou Xiaolong
* @email shaolongchou@126.com
*/
public class MsWordExtractor {
// Parsed Word document; stays null unless the public constructor loads one successfully.
private HWPFDocument doc = null;
// Range obtained from doc.getRange() in the public constructor; null until then.
private Range range = null;
// Created in extractPictures(); presumably holds the extracted pictures - TODO confirm element type.
private List pictsList = null;
// Flags whether any picture exists.
boolean hasPic = false;
/**
 * Creates an extractor for the Microsoft Word document stored at the given path.
 *
 * <p>The file is only opened when its name ends with {@code ".doc"}. Loading is
 * best-effort: if the file is missing or cannot be read, the stack trace is printed
 * and {@code doc} / {@code range} stay {@code null}, so later calls that use them
 * will fail with a {@link NullPointerException}.
 *
 * @param msDocName path of the Word document; must not be {@code null}
 */
public MsWordExtractor(String msDocName) {
    if (msDocName.endsWith(".doc")) {
        FileInputStream in = null;
        try {
            in = new FileInputStream(msDocName);
            doc = new HWPFDocument(in);
            range = doc.getRange();
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler covers both cases.
            e.printStackTrace();
        } finally {
            // Release the file handle; the document has been built (or has failed) by now.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
/**
 * No-argument constructor, kept private.
 *
 * <p>Performs no initialization: {@code doc} and {@code range} keep their
 * {@code null} defaults.
 */
private MsWordExtractor() {}
/**
 * Collects the text of every paragraph in the document range.
 *
 * <p>Paragraphs are visited in index order and their text is concatenated
 * without adding any separator.
 *
 * @return the concatenated text of all paragraphs
 */
public String getAllText() {
    final int paragraphCount = range.numParagraphs();
    StringBuilder text = new StringBuilder();
    int index = 0;
    while (index < paragraphCount) {
        // Append the text of the paragraph at this position.
        text.append(range.getParagraph(index).text());
        index++;
    }
    return text.toString();
}
/**
* 从word里面提取图片
* @return
*/
public boolean extractPictures() {
pictsList = new ArrayList();
// 得到文档的数据流
byte[] dataStream = doc.getDataStream();
int numChar = range.numCharacterRuns();
PicturesTable pTable = new PicturesTable(dataStream);
for (int j = 0; j < numC
评论
共有 条评论