资源简介
利用DOM-TREE模型对网页进行表示
对原始网页进行修正、缺省标签的补充等
利用网页正文提取方法对网页进行正文提取,去除网页中的噪声信息,提取出网页中的正文、相关超链接
代码片段和文件信息
using System;
using System.IO;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using mshtml;
using System.Runtime.InteropServices;
/// <summary>
/// Managed declaration of the COM <c>IPersistStreamInit</c> interface
/// (IID 7FD52380-4E07-101B-AE2D-08002B2EC713), imported as an IUnknown-based interface.
/// </summary>
/// <remarks>
/// This is a <c>ComImport</c> interface, so the members must stay in this exact order:
/// the runtime maps them to COM vtable slots by declaration position.
/// </remarks>
[ComVisible(true), ComImport(), Guid("7FD52380-4E07-101B-AE2D-08002B2EC713"), InterfaceTypeAttribute(ComInterfaceType.InterfaceIsIUnknown)]
public interface IPersistStreamInit
{
    /// <summary>Retrieves the class identifier of the object into <paramref name="pClassID"/>.</summary>
    void GetClassID([In, Out] ref Guid pClassID);

    /// <summary>Returns the raw HRESULT-style integer (PreserveSig) instead of throwing.</summary>
    [return: MarshalAs(UnmanagedType.I4)]
    [PreserveSig]
    int IsDirty();

    /// <summary>Initializes the object from the given stream.</summary>
    void Load([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm);

    /// <summary>Saves the object to the given stream.</summary>
    void Save([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm,
        [In, MarshalAs(UnmanagedType.I4)] int fClearDirty);

    // NOTE(review): marshalling a scalar long as LPArray looks wrong for an out
    // size parameter; kept as in the original declaration because nothing visible
    // here calls it. Confirm against the native signature before using it.
    void GetSizeMax([Out, MarshalAs(UnmanagedType.LPArray)] long pcbSize);

    /// <summary>Initializes the object to a default (empty) state.</summary>
    void InitNew();
}
namespace WindowsFormsApplication1
{
public partial class Form1 : Form
{
/// <summary>
/// Creates the form, initializes its components and starts loading the
/// start page in <c>webBrowser1</c>.
/// </summary>
public Form1()
{
    InitializeComponent();

    // The original literal used the scheme "HttpstyleUriParser://", which is not a
    // URL scheme (it looks like an auto-completion accident); the plain http
    // scheme is what lets the browser control actually load the page.
    webBrowser1.Navigate("http://www.baidu.com");
}
/// <summary>
/// Event handler stub with the DocumentCompleted signature; currently does nothing.
/// </summary>
/// <param name="sender">The object that raised the event.</param>
/// <param name="e">Event data supplied by the browser control.</param>
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
}
/// <summary>
/// Event handler stub with the AfterSelect signature; currently does nothing.
/// </summary>
/// <param name="sender">The object that raised the event.</param>
/// <param name="e">Event data describing the selected tree node.</param>
private void treeView1_AfterSelect(object sender, TreeViewEventArgs e)
{
}
/// <summary>
/// Reads the HTML of the document loaded in <c>webBrowser1</c>, shows it in
/// <c>richTextBox1</c>, parses it with <c>Parse</c> and inserts the resulting
/// DOM nodes under a new "HTML" root node in <c>treeView1</c>.
/// </summary>
/// <param name="sender">The object that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // Guard clause: nothing to do while the browser has no document.
    if (webBrowser1.Document == null)
    {
        MessageBox.Show("webbrowser为空");
        return;
    }

    // Get the HTML text. The reader is disposed as soon as the text has been
    // read so the underlying stream is not leaked.
    string html;
    using (StreamReader sr = new StreamReader(webBrowser1.DocumentStream, Encoding.GetEncoding("gb2312")))
    {
        html = sr.ReadToEnd();
    }
    richTextBox1.Text = html;

    // Build the DOM tree from the HTML text.
    IHTMLDocument2 doc2 = Parse(html);
    IHTMLDocument3 htmldocument = (IHTMLDocument3)doc2;
    IHTMLDOMNode rootDomNode = (IHTMLDOMNode)htmldocument.documentElement; // root of the DOM tree

    TreeNode root = treeView1.Nodes.Add("HTML"); // root node of the tree view
    InsertDOMNodes(rootDomNode, root);           // insert the remaining nodes under the root node
}
unsafe IHTMLDocument2 Parse(string s) //unsafe关键字表示不安全上下文,该上下文是任何涉及指针的操作所必需的。
{
IHTMLDocument2 pDocument = new HTMLDocumentClass();
if (pDocument != null)
{
IPersistStreamInit pPersist = pDocument as IPersistStreamInit; //as运算符类似于强制转换操作;如果转换不可行,as会返回null而不是引发异常。
pPersist.InitNew();
pPersist = null;
IMarkupServices ms = pDocument as IMarkupServices;
if (ms != null)
{
IMarkupContainer pMC = null;
IMarkupPointer pStart pEnd;
ms.CreateMarkupPointer(out pStart);
ms.CreateMarkupPointer(o
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 12288 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.exe
文件 28160 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.pdb
文件 14328 2010-11-04 11:25 WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.vshost.exe
文件 490 2007-07-21 01:33 WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.vshost.exe.manifest
文件 5357 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\Form1.cs
文件 3966 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\Form1.Designer.cs
文件 5814 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\Form1.resx
文件 1387 2010-11-04 11:25 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.csproj.FileListAbsolute.txt
文件 847 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.csproj.GenerateResource.Cache
文件 12288 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.exe
文件 180 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.Form1.resources
文件 28160 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.pdb
文件 180 2010-11-04 11:21 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.Properties.Resources.resources
文件 516 2010-11-04 11:01 WindowsFormsApplication1\WindowsFormsApplication1\Program.cs
文件 1466 2010-11-04 10:18 WindowsFormsApplication1\WindowsFormsApplication1\Properties\AssemblyInfo.cs
文件 2877 2010-11-04 10:18 WindowsFormsApplication1\WindowsFormsApplication1\Properties\Resources.Designer.cs
文件 5612 2010-11-04 10:18 WindowsFormsApplication1\WindowsFormsApplication1\Properties\Resources.resx
文件 1109 2010-11-04 10:18 WindowsFormsApplication1\WindowsFormsApplication1\Properties\Settings.Designer.cs
文件 249 2010-11-04 10:18 WindowsFormsApplication1\WindowsFormsApplication1\Properties\Settings.settings
文件 4162 2010-11-04 11:20 WindowsFormsApplication1\WindowsFormsApplication1\WindowsFormsApplication1.csproj
文件 12288 2010-11-04 11:24 WindowsFormsApplication1\WindowsFormsApplication1.exe
文件 962 2010-11-04 10:18 WindowsFormsApplication1\WindowsFormsApplication1.sln
..A..H. 18944 2010-11-04 11:25 WindowsFormsApplication1\WindowsFormsApplication1.suo
目录 0 2010-11-04 11:26 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\TempPE
目录 0 2010-11-04 11:26 WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug
目录 0 2010-11-04 11:26 WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug
目录 0 2010-11-04 11:26 WindowsFormsApplication1\WindowsFormsApplication1\bin
目录 0 2010-11-04 11:26 WindowsFormsApplication1\WindowsFormsApplication1\obj
目录 0 2010-11-04 11:26 WindowsFormsApplication1\WindowsFormsApplication1\Properties
目录 0 2010-11-04 11:26 WindowsFormsApplication1\WindowsFormsApplication1
............此处省略4个文件信息
相关资源
- visual studio 2017中英文企业版离线安装
- System.Net.Http.dll.rar
- 简单的学生宿舍管理系统
- 医院药品进销存系统
- 判断接收到的字节流网络流是何种编
- 连接数据库并实现登录功能的
- Webrequests模拟登录
- Unity LitJson.dll
- 关于app.config共用的问题
- 基于UCC28019的高功率因数电源设计.r
- progressMy.zip
- 简单的信息隐藏和DES加密
- .net 网上购物课程设计
- Mac地址修改源代码
- fluentftp组件
- 班级管理系统
- 向指定ip端口发送数据并等待接收
- 实现字符串异或操作
- PDA自动更新程序源码
- ERP数据模型用powerdesigner建模
- 兰勃特墨卡托投影.rar
- .Net Core 基本代码
- dsoframer.ocx 2.3.0.2版本
- 人员信息管理系统,简易人员信息管
- datagridview添加一个合计行[美好实用版
- CSharp_Region类的方法使用图解
- CoreAudioApi.dll
- [深入浅出AutoCAD.NET二次开发].李冠亿
- edtFTPnetPro v8.6.5
- DiDaJiangCheng.sln三层架构学生学籍管理
评论
共有 条评论