资源简介
使用 C# 和 XPath 语法编写的简单爬虫,具有导出、下载 Excel 的功能
代码片段和文件信息
using NPOI.HSSF.UserModel;
using NPOI.SS.UserModel;
using System;
using System.Collections.Generic;
using System.Data;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
public partial class NewList : System.Web.UI.Page
{
/// <summary>
/// Page load handler. Intentionally empty: the page performs no work until
/// the export button is clicked.
/// </summary>
/// <param name="sender">The object that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
}
/// <summary>
/// Click handler for Button1. Downloads the news list page, extracts the
/// link, title and date of each <c>dl.ev_p_dl</c> entry, and hands the
/// collected rows to <c>RenderToExcel</c> / <c>RenderToBrowser</c> under a
/// tick-based <c>.xls</c> file name.
/// </summary>
/// <param name="sender">The object that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    List<Dictionary<string, string>> result = new List<Dictionary<string, string>>();
    var url = "http://www.cricchina.com/research/NewsList?cId=4";
    string resultHtml;

    using (var ct = new WebClient())
    {
        ct.Encoding = Encoding.UTF8;

        // Define the request headers BEFORE issuing the request. The page is
        // fetched exactly once: the earlier extra download ran without these
        // headers and its result was discarded (cookie parsing is disabled).
        ct.Headers.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
        ct.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
        ct.Headers.Add("Accept-Language", "zh-CN,zh;q=0.8");

        // Fetch the news list markup.
        resultHtml = ct.DownloadString(url);
    }

    // Convert the HTML string into an HtmlDocument so it can be queried.
    var htmlDocument = new HtmlAgilityPack.HtmlDocument();
    htmlDocument.LoadHtml(resultHtml);

    // Locate the news entries with XPath (see http://www.w3school.com.cn/xpath/).
    var list = htmlDocument.DocumentNode.SelectNodes("//dl[@class='ev_p_dl']");
    if (list == null)
    {
        // SelectNodes yields null (not an empty collection) when nothing matches.
        return;
    }

    foreach (var item in list)
    {
        // The entry is read positionally: child 1 contains the link at its
        // child index 2, and child 3 contains the date. Skip entries that do
        // not have this shape instead of failing the whole export.
        if (item.ChildNodes.Count < 4 || item.ChildNodes[1].ChildNodes.Count < 3)
        {
            continue;
        }

        var link = item.ChildNodes[1].ChildNodes[2];
        var hrefAttribute = link.Attributes["href"];
        if (hrefAttribute == null)
        {
            continue;
        }

        var dic = new Dictionary<string, string>();
        dic.Add("href", hrefAttribute.Value);
        dic.Add("title", link.InnerHtml);
        dic.Add("date", item.ChildNodes[3].InnerHtml);
        result.Add(dic);
    }

    if (result.Count > 0)
    {
        RenderToBrowser(RenderToExcel(toData(result)), System.DateTime.Now.Ticks + ".xls");
    }
}
/// <summary>
/// Converts the scraped news entries into a <see cref="DataTable"/> named
/// "mid" with three string columns: 标题 (title), 地址 (address) and 时间 (time).
/// </summary>
/// <param name="list">
/// Entries keyed by "title", "href" and "date". A null list is treated as
/// empty and produces a table with columns but no rows.
/// </param>
/// <returns>The populated table, one row per entry in <paramref name="list"/>.</returns>
/// <exception cref="KeyNotFoundException">
/// An entry lacks one of the "title", "href" or "date" keys.
/// </exception>
public DataTable toData(List<Dictionary<string, string>> list)
{
    const string siteRoot = "http://www.cricchina.com/";

    DataTable dte = new DataTable("mid");
    dte.Columns.Add(new DataColumn("标题", typeof(string)));
    dte.Columns.Add(new DataColumn("地址", typeof(string)));
    dte.Columns.Add(new DataColumn("时间", typeof(string)));

    if (list == null)
    {
        return dte;
    }

    Uri baseUri = new Uri(siteRoot);

    // Add the data to the DataTable.
    foreach (var item in list)
    {
        string href = item["href"];

        // Resolve the link against the site root so that both "path" and
        // "/path" forms give a single-slash absolute URL; fall back to plain
        // concatenation if the value cannot be parsed as a URI.
        Uri absolute;
        string address = Uri.TryCreate(baseUri, href, out absolute)
            ? absolute.AbsoluteUri
            : siteRoot + href;

        DataRow dr = dte.NewRow();
        dr["标题"] = item["title"];
        dr["地址"] = address;
        dr["时间"] = item["date"];
        dte.Rows.Add(dr);
    }

    return dte;
}
public
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2017-09-18 17:55 CRIC\
目录 0 2017-09-18 17:49 CRIC\App_Data\
目录 0 2017-09-18 17:49 CRIC\App_Data\PublishProfiles\
文件 836 2017-09-18 17:55 CRIC\App_Data\PublishProfiles\CRIC.pubxml
文件 484 2017-09-18 16:09 CRIC\NewList.aspx
文件 4955 2017-09-18 17:52 CRIC\NewList.aspx.cs
文件 1306 2017-09-18 15:53 CRIC\Web.Debug.config
文件 387 2017-09-18 15:53 CRIC\Web.config
目录 0 2017-09-18 17:00 CRIC\bin\
文件 134656 2017-03-09 12:07 CRIC\bin\HtmlAgilityPack.dll
文件 298496 2017-03-09 12:07 CRIC\bin\HtmlAgilityPack.pdb
文件 1624064 2017-01-19 13:05 CRIC\bin\NPOI.dll
文件 2527 2017-09-18 17:40 CRIC\website.publishproj
- 上一篇:C# 有向图 邻接矩阵 路径查询
- 下一篇:阿基米德平面螺旋天线
评论
共有 条评论