资源简介
只需把 HTML 内容读取出来,传入该方法,即可得到其中的纯文本。这个方法很好用,我找了很久,现在分享出来。
代码片段和文件信息
/*
* File: WebFormatter.java
* Created on 2005-6-24
* Author: Liao Xuefeng asklxf@163.com
* Copyright (C) 2005 Liao Xuefeng.
*/
import java.util.*;
import java.text.SimpleDateFormat;
/**
* Formatting helpers for web display.
*
* @author Xuefeng
*/
public class WebFormatter {
public static String html2text(String html) {
StringBuffer sb = new StringBuffer(html.length());
char[] data = html.toCharArray();
int start = 0;
boolean previousIsPre = false;
Token token = null;
for(;;) {
token = parse(data start previousIsPre);
if(token==null)
break;
previousIsPre = token.isPreTag();
sb = sb.append(token.getText());
start += token.getLength();
}
return sb.toString();
}
private static Token parse(char[] data int start boolean previousIsPre) {
if(start>=data.length)
return null;
// try to read next char:
char c = data[start];
if(c==‘<‘) {
// this is a tag or comment or script:
int end_index = indexOf(data start+1 ‘>‘);
if(end_index==(-1)) {
// the left is all text!
return new Token(Token.TOKEN_TEXT data start data.length previousIsPre);
}
String s = new String(data start end_index-start+1);
// now we got s=“<...>“:
if(s.startsWith(““);
if(end_comment_index==(-1)) {
// illegal end but treat as comment:
return new Token(Token.TOKEN_COMMENT data start data.length previousIsPre);
}
else
return new Token(Token.TOKEN_COMMENT data start end_comment_index+3 previousIsPre);
}
String s_lowerCase = s.toLowerCase();
if(s_lowerCase.startsWith(“ript“)) { // this is a script:
int end_script_index = indexOf(data start+1 “ ript>“);
if(end_script_index==(-1))
// illegal end but treat as script:
return new Token(Token.TOKEN_script data start data.length previousIsPre);
else
return new Token(Token.TOKEN_script data start end_script_index+9 previousIsPre);
}
else { // this is a tag:
return new Token(Token.TOKEN_TAG data start start+s.length() previousIsPre);
}
}
// this is a text:
int next_tag_index = indexOf(data start+1 ‘<‘);
if(next_tag_index==(-1))
return new Token(Token.TOKEN_TEXT data start data.length previousIsPre);
return new Token(Token.TOKEN_TEXT data start next_tag_index previousIsP
评论
共有 条评论