资源简介
微博数据爬取 demo，解析微博评论数、点赞数、图片链接等。
代码片段和文件信息
package top.kittygirl.wechat;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
public class cpwsDataCrawler extends BreadthCrawler {
/**
* @param crawlPath crawlPath is the path of the directory which maintains
* information of this crawler
* @param autoParse if autoParse is trueBreadthCrawler will auto extract
* links which match regex rules from pag
*/
public cpwsDataCrawler(String crawlPath boolean autoParse) {
super(crawlPath autoParse);
/*start page*/
//this.addSeed(“http://news.xidian.edu.cn/“);
// http://wenshu.court.gov.cn/List/List?sorttype=1&conditions=searchWord+2+AJLX++%E6%A1%88%E4%BB%B6%E7%B1%BB%E5%9E%8B:%E6%B0%91%E4%BA%8B%E6%A1%88%E4%BB%B6
/*fetch url like http://news.hfut.edu.cn/show-xxxxxxhtml*/
//this.addRegex(“http://news.xidian.edu.cn/info/.*htm“);
/*do not fetch jpg|png|gif*/
//this.addRegex(“-.*\\.(jpg|png|gif).*“);
/*do not fetch url contains #*/
// this.addRegex(“-.*#.*“);
this.addSeed(“http://wenshu.court.gov.cn/List/List?sorttype=1&conditions=searchWord+2+AJLX++%E6%A1%88%E4%BB%B6%E7%B1%BB%E5%9E%8B:%E6%B0%91%E4%BA%8B%E6%A1%88%E4%BB%B6“);
setThreads(50);
getConf().setTopN(1000);
getConf().setExecuteInterval(100000);
}
public void visit(Page page CrawlDatums next) {
String a = page.select(“#list“).select(“#resultList“).select(“#dataItem1“).text();
// String url = page.url();
// System.out.println(url);
/*if page is news page*/
/* if (page.matchUrl(“http://news.xidian.edu.cn/info/.*htm“)) {
*//*extract title and content of news by css selector*//*
String title = page.select(“div.neirong-bt“).text();
String date = page.select(“span#date“).text();
String clickNum = page.select(“div#wz_info.b_b“).first().child(3).select(“span“).first().child(0).select(“span“).val();
String content = page.selectText(“div#artibody“);
System.out.println(“URL:\n“ + url);
System.out.println(“title:\n“ + title);
System.out.println(“date:\n“ + date);
System.out.println(“clickNum:\n“ + clickNum);
System.out.println(“content:\n“ + content);
}*/
}
public static void main(String[] args) throws Exception {
cpwsDataCrawler crawler = new cpwsDataCrawler(“crawlllesZ“ true);
/*start crawl with depth of 4*/
crawler.start(1);
}
}
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2019-03-31 09:14 weChatCrawler-master\
目录 0 2019-03-31 09:14 weChatCrawler-master\.idea\
文件 624 2019-03-01 14:40 weChatCrawler-master\.idea\compiler.xml
文件 138 2019-03-01 14:40 weChatCrawler-master\.idea\encodings.xml
目录 0 2019-03-31 09:14 weChatCrawler-master\.idea\fileTemplates\
目录 0 2019-03-01 14:59 weChatCrawler-master\.idea\fileTemplates\code\
目录 0 2019-03-01 14:59 weChatCrawler-master\.idea\fileTemplates\includes\
目录 0 2019-03-01 14:59 weChatCrawler-master\.idea\fileTemplates\internal\
目录 0 2019-03-01 14:59 weChatCrawler-master\.idea\fileTemplates\j2ee\
目录 0 2019-03-31 09:14 weChatCrawler-master\.idea\libraries\
文件 504 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__cglib_cglib_nodep_3_2_4.xml
文件 642 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__cn_edu_hfut_dmic_webcollector_WebCollector_2_73_alpha.xml
文件 543 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__commons_codec_commons_codec_1_10.xml
文件 503 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__commons_io_commons_io_2_5.xml
文件 558 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__commons_logging_commons_logging_1_2.xml
文件 514 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__com_alibaba_fastjson_1_2_41.xml
文件 564 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__com_codeborne_phantomjsdriver_1_4_0.xml
文件 654 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__com_googlecode_juniversalchardet_juniversalchardet_1_0_3.xml
文件 515 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__com_google_code_gson_gson_2_8_0.xml
文件 499 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__com_google_guava_guava_21_0.xml
文件 480 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__com_sleepycat_je_5_0_73.xml
文件 536 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__com_squareup_okhttp3_okhttp_3_11_0.xml
文件 510 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__com_squareup_okio_okio_1_14_0.xml
文件 578 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__javax_servlet_javax_servlet_api_3_1_0.xml
文件 455 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__junit_junit_4_12.xml
文件 469 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__log4j_log4j_1_2_17.xml
文件 574 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__mysql_mysql_connector_java_5_1_31.xml
文件 492 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__net_java_dev_jna_jna_4_1_0.xml
文件 555 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__net_java_dev_jna_jna_platform_4_1_0.xml
文件 498 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__net_sf_opencsv_opencsv_2_3.xml
文件 577 2019-03-16 14:13 weChatCrawler-master\.idea\libraries\Maven__net_sourceforge_cssparser_cssparser_0_9_22.xml
............此处省略74个文件信息
评论
共有 条评论