资源简介
利用DOM-TREE模型对网页进行表示
对原始网页进行修正,例如补全缺省(缺失)的标签等
利用网页正文提取方法对网页进行正文提取,去除网页中的噪声信息,提取出网页中的正文、相关超链接

代码片段和文件信息
using System;
using System.IO;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using mshtml;
using System.Runtime.InteropServices;
/// <summary>
/// Managed declaration of the COM <c>IPersistStreamInit</c> interface
/// (IID 7FD52380-4E07-101B-AE2D-08002B2EC713), imported as an IUnknown-based interface.
/// In this file it is used to call <see cref="InitNew"/> on a freshly created HTML document.
/// </summary>
/// <remarks>
/// The declaration order of the members is significant for a COM-imported interface
/// and must not be changed.
/// </remarks>
[ComVisible(true), ComImport(), Guid("7FD52380-4E07-101B-AE2D-08002B2EC713"), InterfaceTypeAttribute(ComInterfaceType.InterfaceIsIUnknown)]
public interface IPersistStreamInit
{
    /// <summary>Retrieves the class identifier of the object into <paramref name="pClassID"/>.</summary>
    void GetClassID([In, Out] ref Guid pClassID);

    /// <summary>Returns the raw result code unchanged (PreserveSig), marshaled as a 32-bit integer.</summary>
    [return: MarshalAs(UnmanagedType.I4)]
    [PreserveSig]
    int IsDirty();

    /// <summary>Loads the object from the stream <paramref name="pstm"/>.</summary>
    void Load([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm);

    /// <summary>Saves the object to the stream <paramref name="pstm"/>.</summary>
    /// <param name="pstm">Target stream.</param>
    /// <param name="fClearDirty">Passed through as a 32-bit integer flag.</param>
    void Save([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm,
              [In, MarshalAs(UnmanagedType.I4)] int fClearDirty);

    // NOTE(review): a by-value long marshaled as LPArray cannot return a size to the
    // caller; this probably should be an out parameter - confirm before calling it.
    // The signature is left as-is so existing callers are unaffected.
    void GetSizeMax([Out, MarshalAs(UnmanagedType.LPArray)] long pcbSize);

    /// <summary>Initializes the object to a default state.</summary>
    void InitNew();
}
namespace WindowsFormsApplication1
{
public partial class Form1 : Form
{
/// <summary>
/// Creates the form: runs the designer-generated <c>InitializeComponent</c> and then
/// starts navigating the embedded browser control to the Baidu home page.
/// </summary>
public Form1()
{
    InitializeComponent();
    // The scheme must be a real URL scheme; "HttpstyleUriParser://" is not one and
    // the browser control cannot load it.
    webBrowser1.Navigate("http://www.baidu.com");
}
/// <summary>
/// Handler for the browser control's document-completed event. Intentionally empty:
/// the page content is processed on demand from the button click handler instead.
/// </summary>
/// <param name="sender">Event source.</param>
/// <param name="e">Event data for the completed document.</param>
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
}
/// <summary>
/// Handler for the tree view's after-select event. Currently a no-op.
/// </summary>
/// <param name="sender">Event source.</param>
/// <param name="e">Event data describing the selected node.</param>
private void treeView1_AfterSelect(object sender, TreeViewEventArgs e)
{
}
/// <summary>
/// Reads the HTML of the page currently loaded in the browser control, shows it in the
/// rich text box, parses it into an mshtml document and adds its DOM tree to the tree
/// view under a new "HTML" root node. Shows a message box when no document is loaded.
/// </summary>
/// <param name="sender">Event source.</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    if (webBrowser1.Document != null)
    {
        // Get the HTML text. The stream is decoded as gb2312;
        // NOTE(review): pages served in another encoding would be mis-decoded - confirm.
        string html;
        using (StreamReader sr = new StreamReader(webBrowser1.DocumentStream, Encoding.GetEncoding("gb2312")))
        {
            html = sr.ReadToEnd();
        }
        richTextBox1.Text = html;

        // Get the DOM tree.
        IHTMLDocument2 doc2 = Parse(html);
        IHTMLDocument3 htmldocument = (IHTMLDocument3)doc2;
        IHTMLDOMNode rootDomNode = (IHTMLDOMNode)htmldocument.documentElement; // root of the DOM tree
        TreeNode root = treeView1.Nodes.Add("HTML"); // root node of the tree view
        InsertDOMNodes(rootDomNode, root); // insert all other nodes beneath the root node
    }
    else
    {
        MessageBox.Show("webbrowser为空");
    }
}
unsafe IHTMLDocument2 Parse(string s) //unsafe关键字表示不安全上下文,该上下文是任何涉及指针的操作所必需的。
{
IHTMLDocument2 pDocument = new HTMLDocumentClass();
if (pDocument != null)
{
IPersistStreamInit pPersist = pDocument as IPersistStreamInit; //as运算符类似于强制转换操作;如果转换不可行,as会返回null而不是引发异常。
pPersist.InitNew();
pPersist = null;
IMarkupServices ms = pDocument as IMarkupServices;
if (ms != null)
{
IMarkupContainer pMC = null;
IMarkupPointer pStart pEnd;
ms.CreateMarkupPointer(out pStart);
ms.CreateMarkupPointer(o
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 12288 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.exe
文件 28160 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.pdb
文件 14328 2010-11-04 11:25 WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.vshost.exe
文件 490 2007-07-21 01:33 WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.vshost.exe.manifest
文件 5357 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\Form1.cs
文件 3966 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\Form1.Designer.cs
文件 5814 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\Form1.resx
文件 1387 2010-11-04 11:25 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.csproj.FileListAbsolute.txt
文件 847 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.csproj.GenerateResource.Cache
文件 12288 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.exe
文件 180 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.Form1.resources
文件 28160 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.pdb
文件 180 2010-11-04 11:21 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.Properties.Resources.resources
文件 516 2010-11-04 11:01 WindowsFormsApplication1\WindowsFormsApplication1\Program.cs
文件 1466 2010-11-04 10:18 WindowsFormsApplication1\WindowsFormsApplication1\Properties\AssemblyInfo.cs
文件 2877 2010-11-04 10:18 WindowsFormsApplication1\WindowsFormsApplication1\Properties\Resources.Designer.cs
文件 5612 2010-11-04 10:18 WindowsFormsApplication1\WindowsFormsApplication1\Properties\Resources.resx
文件 1109 2010-11-04 10:18 WindowsFormsApplication1\WindowsFormsApplication1\Properties\Settings.Designer.cs
文件 249 2010-11-04 10:18 WindowsFormsApplication1\WindowsFormsApplication1\Properties\Settings.settings
文件 4162 2010-11-04 11:20 WindowsFormsApplication1\WindowsFormsApplication1\WindowsFormsApplication1.csproj
文件 12288 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1.exe
文件 962 2010-11-04 10:18 WindowsFormsApplication1\WindowsFormsApplication1.sln
..A..H. 18944 2010-11-04 11:25 WindowsFormsApplication1\WindowsFormsApplication1.suo
目录 0 2010-11-04 11:26 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\TempPE
目录 0 2010-11-04 11:26 WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug
目录 0 2010-11-04 11:26 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug
目录 0 2010-11-04 11:26 WindowsFormsApplication1\WindowsFormsApplication1\bin
目录 0 2010-11-04 11:26 WindowsFormsApplication1\WindowsFormsApplication1\obj
目录 0 2010-11-04 11:26 WindowsFormsApplication1\WindowsFormsApplication1\Properties
目录 0 2010-11-04 11:26 WindowsFormsApplication1\WindowsFormsApplication1
............此处省略4个文件信息
相关资源
- WPF USB 网络 串口 通信软件
- B/S 网上订餐系统
- 教室管理系统.rar
- 小鸡快跑游戏.
- 分别适用于.NET framework 2.0和4.0的E
- 汽车租赁系统............................
- 德卡D8读写器关于读写感应卡的一些代
- halcon 测量助手
- 图片存储到数据库保存二进制文件并
- 用Socket写的简易FTP服务器和客户端
- 企业销售管理信息系统(全套)
- 串口操作类(justinio)
- 基于Petri网的工作流(完整的原创源代
- 选择题自动考试系统
- 多线程实例:桌面智能弹球小游戏
- 土地信息管理系统
- ServiceStack V3.9 全部dll
- PDF pdfview.ocx 无水印
- 无需共享打印机实现远程打印功能小
- 真正的破解版PDFView4NET
- 网页调用ActiveX控件获取串口数据
- Luence的与盘古分词的使用软件
- Emgu.CV 打开视频与人脸检测
- 麦克纳姆轮程序.rar
- Unity3D 实战视频教程 保卫萝卜 2D 游戏
- net微信支付
- lucene.net+盘古分词多条件全文检索并匹
- 闪电猫-电商下图助手5.0.zip
- FastReport.Net V2014.4.8 For .Net2.0
- NET Reflector 8.3破解版自带代码导出
评论
共有 条评论