资源简介
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HtmlAgilityPack;
using System.Net;
namespace PacksModels
{
/// <summary>
/// Helpers for downloading a page's HTML and querying it with HtmlAgilityPack XPath selectors.
/// </summary>
public class HtmlAgilityPackHelper
{
    // Log file that every diagnostic message of this class is written to.
    private const string ErrorLogPath = "Log\\error.log";

    // Pause after a failed download before control returns to the caller (one minute).
    private const int RetryDelayMilliseconds = 60000;

    /// <summary>
    /// Downloads the HTML at <paramref name="url"/>, retrying until a download succeeds.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="charSet">Passed through to <see cref="QueryHtml"/>, which does not use it.</param>
    /// <returns>The decoded page text, or null when <paramref name="url"/> is null or blank.</returns>
    public static string getHtml(string url, string charSet)
    {
        // A blank URL can never succeed; without this guard the retry loop below
        // would spin forever because QueryHtml returns null immediately for it.
        if (string.IsNullOrWhiteSpace(url))
        {
            return null;
        }

        // QueryHtml returns null after a failed attempt (it already slept before returning),
        // so keep asking until a download goes through.
        string html = QueryHtml(url, charSet);
        while (html == null)
        {
            html = QueryHtml(url, charSet);
        }
        return html;
    }

    /// <summary>
    /// Makes a single attempt to download the page and decode it as UTF-8 or GBK.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="charSet">Not used; the encoding is chosen by inspecting the downloaded bytes.</param>
    /// <returns>
    /// The decoded page text; null when <paramref name="url"/> is null or blank, or when the
    /// download raised a <see cref="WebException"/> (in which case the failure is logged and
    /// the calling thread is paused for one minute before null is returned).
    /// </returns>
    public static string QueryHtml(string url, string charSet)
    {
        if (string.IsNullOrWhiteSpace(url))
        {
            return null;
        }

        byte[] pageData = null;
        WebException failure = null;
        XWebClient wc = new XWebClient();
        try
        {
            wc.Credentials = CredentialCache.DefaultCredentials;
            wc.Headers["User-Agent"] = "blah";
            pageData = wc.DownloadData(url);
        }
        catch (WebException ex)
        {
            failure = ex;
        }
        finally
        {
            // Release the client on the success path as well as on failure.
            wc.Dispose();
        }

        if (failure != null)
        {
            LogDownloadFailure(failure, url);
            Helpers.WriteLog("释放资源等1分钟重试:" + DateTime.Now.ToString() + ":" + url, ErrorLogPath);
            System.Threading.Thread.Sleep(RetryDelayMilliseconds); // wait one minute
            Helpers.WriteLog("开始重试:" + DateTime.Now.ToString() + ":" + url, ErrorLogPath);
            return null;
        }

        if (pageData == null)
        {
            return null;
        }

        // Decode as UTF-8 first; if that yields replacement characters the bytes
        // were not valid UTF-8, so fall back to GBK.
        string utf8Text;
        using (var reader = new System.IO.StreamReader(new System.IO.MemoryStream(pageData), Encoding.UTF8))
        {
            utf8Text = reader.ReadToEnd();
        }

        string charset = isLuan(utf8Text) ? "gbk" : "utf-8";
        return Encoding.GetEncoding(charset).GetString(pageData);
    }

    /// <summary>
    /// Writes the log entry that matches the kind of download failure.
    /// </summary>
    /// <param name="ex">The exception raised by the download.</param>
    /// <param name="url">The address that was being downloaded.</param>
    private static void LogDownloadFailure(WebException ex, string url)
    {
        string detail = ex.ToString();

        // Prefer the status code; the message-text checks are kept as a fallback
        // because they only match on a Chinese-localized runtime.
        if (ex.Status == WebExceptionStatus.NameResolutionFailure || detail.Contains("未能解析此远程名称"))
        {
            Helpers.WriteLog("未能解析此远程名称,请检查网络,正在重试下载此资源...:" + DateTime.Now.ToString() + ":" + url, ErrorLogPath);
        }
        else if (ex.Status == WebExceptionStatus.Timeout || detail.Contains("操作超时") || detail.Contains("操作已超时"))
        {
            Helpers.WriteLog("操作超时,请检查资源请求频率,正在重试下载此资源...:" + DateTime.Now.ToString() + ":" + url, ErrorLogPath);
        }
        else
        {
            Helpers.WriteLog("发送请求期间异常,请检查网络:" + DateTime.Now.ToString() + ":" + detail, ErrorLogPath);
        }
    }

    /// <summary>
    /// Tells whether the text contains garbled characters, i.e. the Unicode
    /// replacement character U+FFFD (bytes 239 191 189 in UTF-8).
    /// </summary>
    /// <param name="txt">Text to inspect.</param>
    /// <returns>True when a replacement character is present; false otherwise, including for null or empty text.</returns>
    private static bool isLuan(string txt)
    {
        if (string.IsNullOrEmpty(txt))
        {
            return false;
        }

        // Searching the string directly also finds a replacement character in the
        // final position, which the earlier byte scan skipped.
        return txt.IndexOf('\uFFFD') >= 0;
    }

    /// <summary>
    /// Alternative garbled-text check kept for compatibility.
    /// NOTE(review): this can never return true, because a byte cannot exceed 255
    /// (and ASCII encoding maps every non-ASCII character to '?'). Confirm the
    /// intended rule before relying on it.
    /// </summary>
    /// <param name="txt">Text to inspect.</param>
    /// <returns>Always false for non-null input.</returns>
    private static bool isLuan1(string txt)
    {
        var bytes = Encoding.ASCII.GetBytes(txt);
        for (var i = 0; i < bytes.Length; i++)
        {
            if (bytes[i] > 255)
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Tells whether the text contains at least one character that GB2312 encodes as two bytes.
    /// </summary>
    /// <param name="words">Text to inspect.</param>
    /// <returns>True when such a character is found; false otherwise, including for null or empty text.</returns>
    private static bool WordsIScn(string words)
    {
        if (string.IsNullOrEmpty(words))
        {
            return false;
        }

        // Look the encoding up once instead of once per character.
        Encoding gb2312 = Encoding.GetEncoding("gb2312");
        char[] chars = words.ToCharArray();
        for (int i = 0; i < chars.Length; i++)
        {
            if (gb2312.GetByteCount(chars, i, 1) == 2)
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Downloads a page and returns the nodes selected by an XPath expression.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="xpath">XPath expression evaluated against the document root.</param>
    /// <returns>The selected nodes, or null when the URL is blank or an exception was logged.</returns>
    public static HtmlNodeCollection GetHtmlNodes(string url, string xpath)
    {
        HtmlNodeCollection navNodes = null;
        try
        {
            if (string.IsNullOrWhiteSpace(url))
            {
                // Nothing to download; report it instead of failing inside the parser.
                Helpers.WriteLog("获取节点集异常:" + "url为空" + ":" + url, ErrorLogPath);
                return null;
            }

            // Fetch the page source.
            string htmlStr = getHtml(url.Trim(), "");

            // Parse it and run the XPath query.
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(htmlStr);
            navNodes = doc.DocumentNode.SelectNodes(xpath);
        }
        catch (Exception ex)
        {
            Helpers.WriteLog("获取节点集异常:" + ex.ToString() + ":" + url, ErrorLogPath);
        }
        return navNodes;
    }

    /// <summary>
    /// Returns the single node selected by an XPath expression in an already loaded document.
    /// </summary>
    /// <param name="doc">Document to query.</param>
    /// <param name="xpath">XPath expression evaluated against the document root.</param>
    /// <returns>The selected node, or null when an exception was logged.</returns>
    public static HtmlNode GetNode(HtmlDocument doc, string xpath)
    {
        HtmlNode navNode = null;
        try
        {
            navNode = doc.DocumentNode.SelectSingleNode(xpath);
        }
        catch (Exception ex)
        {
            Helpers.WriteLog("获取单节点异常:" + ex.ToString(), ErrorLogPath);
        }
        return navNode;
    }
}
}
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HtmlAgilityPack;
using System.Net;
namespace PacksModels
{
/// <summary>
/// Helpers for downloading a page's HTML and querying it with HtmlAgilityPack XPath selectors.
/// </summary>
public class HtmlAgilityPackHelper
{
    // Log file that every diagnostic message of this class is written to.
    private const string ErrorLogPath = "Log\\error.log";

    // Pause after a failed download before control returns to the caller (one minute).
    private const int RetryDelayMilliseconds = 60000;

    /// <summary>
    /// Downloads the HTML at <paramref name="url"/>, retrying until a download succeeds.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="charSet">Passed through to <see cref="QueryHtml"/>, which does not use it.</param>
    /// <returns>The decoded page text, or null when <paramref name="url"/> is null or blank.</returns>
    public static string getHtml(string url, string charSet)
    {
        // A blank URL can never succeed; without this guard the retry loop below
        // would spin forever because QueryHtml returns null immediately for it.
        if (string.IsNullOrWhiteSpace(url))
        {
            return null;
        }

        // QueryHtml returns null after a failed attempt (it already slept before returning),
        // so keep asking until a download goes through.
        string html = QueryHtml(url, charSet);
        while (html == null)
        {
            html = QueryHtml(url, charSet);
        }
        return html;
    }

    /// <summary>
    /// Makes a single attempt to download the page and decode it as UTF-8 or GBK.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="charSet">Not used; the encoding is chosen by inspecting the downloaded bytes.</param>
    /// <returns>
    /// The decoded page text; null when <paramref name="url"/> is null or blank, or when the
    /// download raised a <see cref="WebException"/> (in which case the failure is logged and
    /// the calling thread is paused for one minute before null is returned).
    /// </returns>
    public static string QueryHtml(string url, string charSet)
    {
        if (string.IsNullOrWhiteSpace(url))
        {
            return null;
        }

        byte[] pageData = null;
        WebException failure = null;
        XWebClient wc = new XWebClient();
        try
        {
            wc.Credentials = CredentialCache.DefaultCredentials;
            wc.Headers["User-Agent"] = "blah";
            pageData = wc.DownloadData(url);
        }
        catch (WebException ex)
        {
            failure = ex;
        }
        finally
        {
            // Release the client on the success path as well as on failure.
            wc.Dispose();
        }

        if (failure != null)
        {
            LogDownloadFailure(failure, url);
            Helpers.WriteLog("释放资源等1分钟重试:" + DateTime.Now.ToString() + ":" + url, ErrorLogPath);
            System.Threading.Thread.Sleep(RetryDelayMilliseconds); // wait one minute
            Helpers.WriteLog("开始重试:" + DateTime.Now.ToString() + ":" + url, ErrorLogPath);
            return null;
        }

        if (pageData == null)
        {
            return null;
        }

        // Decode as UTF-8 first; if that yields replacement characters the bytes
        // were not valid UTF-8, so fall back to GBK.
        string utf8Text;
        using (var reader = new System.IO.StreamReader(new System.IO.MemoryStream(pageData), Encoding.UTF8))
        {
            utf8Text = reader.ReadToEnd();
        }

        string charset = isLuan(utf8Text) ? "gbk" : "utf-8";
        return Encoding.GetEncoding(charset).GetString(pageData);
    }

    /// <summary>
    /// Writes the log entry that matches the kind of download failure.
    /// </summary>
    /// <param name="ex">The exception raised by the download.</param>
    /// <param name="url">The address that was being downloaded.</param>
    private static void LogDownloadFailure(WebException ex, string url)
    {
        string detail = ex.ToString();

        // Prefer the status code; the message-text checks are kept as a fallback
        // because they only match on a Chinese-localized runtime.
        if (ex.Status == WebExceptionStatus.NameResolutionFailure || detail.Contains("未能解析此远程名称"))
        {
            Helpers.WriteLog("未能解析此远程名称,请检查网络,正在重试下载此资源...:" + DateTime.Now.ToString() + ":" + url, ErrorLogPath);
        }
        else if (ex.Status == WebExceptionStatus.Timeout || detail.Contains("操作超时") || detail.Contains("操作已超时"))
        {
            Helpers.WriteLog("操作超时,请检查资源请求频率,正在重试下载此资源...:" + DateTime.Now.ToString() + ":" + url, ErrorLogPath);
        }
        else
        {
            Helpers.WriteLog("发送请求期间异常,请检查网络:" + DateTime.Now.ToString() + ":" + detail, ErrorLogPath);
        }
    }

    /// <summary>
    /// Tells whether the text contains garbled characters, i.e. the Unicode
    /// replacement character U+FFFD (bytes 239 191 189 in UTF-8).
    /// </summary>
    /// <param name="txt">Text to inspect.</param>
    /// <returns>True when a replacement character is present; false otherwise, including for null or empty text.</returns>
    private static bool isLuan(string txt)
    {
        if (string.IsNullOrEmpty(txt))
        {
            return false;
        }

        // Searching the string directly also finds a replacement character in the
        // final position, which the earlier byte scan skipped.
        return txt.IndexOf('\uFFFD') >= 0;
    }

    /// <summary>
    /// Alternative garbled-text check kept for compatibility.
    /// NOTE(review): this can never return true, because a byte cannot exceed 255
    /// (and ASCII encoding maps every non-ASCII character to '?'). Confirm the
    /// intended rule before relying on it.
    /// </summary>
    /// <param name="txt">Text to inspect.</param>
    /// <returns>Always false for non-null input.</returns>
    private static bool isLuan1(string txt)
    {
        var bytes = Encoding.ASCII.GetBytes(txt);
        for (var i = 0; i < bytes.Length; i++)
        {
            if (bytes[i] > 255)
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Tells whether the text contains at least one character that GB2312 encodes as two bytes.
    /// </summary>
    /// <param name="words">Text to inspect.</param>
    /// <returns>True when such a character is found; false otherwise, including for null or empty text.</returns>
    private static bool WordsIScn(string words)
    {
        if (string.IsNullOrEmpty(words))
        {
            return false;
        }

        // Look the encoding up once instead of once per character.
        Encoding gb2312 = Encoding.GetEncoding("gb2312");
        char[] chars = words.ToCharArray();
        for (int i = 0; i < chars.Length; i++)
        {
            if (gb2312.GetByteCount(chars, i, 1) == 2)
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Downloads a page and returns the nodes selected by an XPath expression.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="xpath">XPath expression evaluated against the document root.</param>
    /// <returns>The selected nodes, or null when the URL is blank or an exception was logged.</returns>
    public static HtmlNodeCollection GetHtmlNodes(string url, string xpath)
    {
        HtmlNodeCollection navNodes = null;
        try
        {
            if (string.IsNullOrWhiteSpace(url))
            {
                // Nothing to download; report it instead of failing inside the parser.
                Helpers.WriteLog("获取节点集异常:" + "url为空" + ":" + url, ErrorLogPath);
                return null;
            }

            // Fetch the page source.
            string htmlStr = getHtml(url.Trim(), "");

            // Parse it and run the XPath query.
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(htmlStr);
            navNodes = doc.DocumentNode.SelectNodes(xpath);
        }
        catch (Exception ex)
        {
            Helpers.WriteLog("获取节点集异常:" + ex.ToString() + ":" + url, ErrorLogPath);
        }
        return navNodes;
    }

    /// <summary>
    /// Returns the single node selected by an XPath expression in an already loaded document.
    /// </summary>
    /// <param name="doc">Document to query.</param>
    /// <param name="xpath">XPath expression evaluated against the document root.</param>
    /// <returns>The selected node, or null when an exception was logged.</returns>
    public static HtmlNode GetNode(HtmlDocument doc, string xpath)
    {
        HtmlNode navNode = null;
        try
        {
            navNode = doc.DocumentNode.SelectSingleNode(xpath);
        }
        catch (Exception ex)
        {
            Helpers.WriteLog("获取单节点异常:" + ex.ToString(), ErrorLogPath);
        }
        return navNode;
    }
}
}
代码片段和文件信息
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HtmlAgilityPack;
using System.Net;
namespace PacksModels
{
public class HtmlAgilityPackHelper
{
/// <summary>
/// Downloads the HTML at <paramref name="url"/>, calling QueryHtml repeatedly
/// until it returns something other than null or the "isExp" failure marker.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">Passed through to QueryHtml unchanged.</param>
/// <returns>The page text, or null when <paramref name="url"/> is null or blank.</returns>
public static string getHtml(string url, string charSet)
{
    // A blank URL can never succeed; bail out instead of retrying forever.
    if (url == null || url.Trim() == "")
    {
        return null;
    }

    string html = QueryHtml(url, charSet);
    while (html == "isExp" || html == null)
    {
        html = QueryHtml(url, charSet);
    }
    return html;
}
//获取网页源码
public static string QueryHtml(string url string charSet)
{
bool isExp = false;
Byte[] pageData = null;
XWebClient wc = new XWebClient();
try
{
相关资源
- asp.net 网页静态化组件(shipingx-Stati
- c# 同时将图片和文字复制到剪贴版 (
- HTML压缩
- Word文件转Html文档目录
- html5 绘图以及 动画
- bbsharp bbcode 转html 写的很简洁
- MVC通过过滤器 实现输出前对html修改(
- MVC输出前对html修改
- 带Html编辑器CSkin.dll版本
- NHtmlFilter1.0过滤Html危险脚本 防止XSS攻
- Html文件上传控件(整理前台使用版)
- html5简单进度条效(progressbar)
- QQ聊天记录Mht转Html格式(附工具源码
- WinformHTMLEditor winform 富文本编辑器
- 纯HTML弹出必填信息(popHint)
- C#读取HTML文件并插入到数据库
- C#实现WebSocket源码c#写的服务端html写的
- Web网页控制摄像头
- HtmlAgilityPack 1.11.2最新版本
- Winista.Htmlparser.Net 源码 +Demo
- c# winform html编辑器
- Winform中显示HTML富文本编辑器
- c#用webkit内核支持html5
- ASP.NET实现网页快照/网页截图(将ht
- HTML5 WebSocket 构建实时 Web 应用
- html5 实时推送消息到客户端(SSE/Eve
- C# Word檔轉Html檔範例
- C# 剪贴板功能 同时黏贴图片和文字等
- 修改 webbrowser 支持IE8/IE9/IE10/HTML5 (
- 手表电商首页、注册页html模板
评论
共有 条评论