• 大小: 25KB
    文件类型: .zip
    金币: 1
    下载: 0 次
    发布日期: 2021-06-06
  • 语言: 其他
  • 标签: 爬虫  

资源简介

一个图片爬虫和一个音乐爬虫,均可完美运行;如有不懂,可以看我的博客。

资源截图

代码片段和文件信息

package crawlers;

import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.baseSeimiCrawler;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;
import org.seimicrawler.xpath.JXDocument;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Date;
import java.util.List;

@Crawler(name = “basic“)
public class Basic extends baseSeimiCrawler {
    @Override
    public String[] startUrls() {
        return new String[]{“https://www.csdn.net/“};
    }
    @Override
    public void start(Response response) {
        JXDocument doc = response.document();
        try {
            List urls = doc.sel(“//a/@href“);
            logger.info(“{}“ urls.size());
            for (object s:urls){
                push(new Request(s.toString()Basic::gettitle));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    public void gettitle(Response response){
        JXDocument doc = response.document();
        List urls2 =doc.sel(“//img/@src“);
        Download(urls2);

        try {
            logger.info(“url:{} {}“ response.getUrl() doc.sel(“//img/@src“));
            //do something
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    //下载图片
    private void Download(List listImgSrc) {
        try {
            //开始时间
            Date begindate = new Date();
            for (String url : listImgSrc) {
                //开始时间
                Date begindate2 = new Date();
                String imageName = url.substring(url.lastIndexOf(“/“) + 1 url.length());
                URL uri = new URL(url);
                InputStream in = uri.openStream();
                FileOutputStream fo = new FileOutputStream(new File(“F:/image/“+imageName));//文件输出流
                byte[] buf = new byte[1024];
                int length = 0;
                System.out.println(“开始下载:“ + url);
                while ((length = in.read(buf 0 buf.length)) != -1) {
                    fo.write(buf 0 length);
                }
                //关闭流
                in.close();
                fo.close();
                System.out.println(imageName + “下载完成“);
                //结束时间
                Date overdate2 = new Date();
                double time = overdate2.getTime() - begindate2.getTime();
                System.out.println(“耗时:“ + time / 1000 + “s“);
            }
            Date overdate = new Date();
            double time = overdate.getTime() - begindate.getTime();
            System.out.println(“总耗时:“ + time / 1000 + “s“);
        } catch (Exception e) {
            System.out.println(“下载失败“);
        }
    }
}

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2019-01-10 11:56  pachong1\
     目录           0  2019-01-10 11:56  pachong1\.idea\
     文件         541  2019-01-10 09:26  pachong1\.idea\compiler.xml
     文件         138  2019-01-09 10:12  pachong1\.idea\encodings.xml
     文件         526  2019-01-09 10:12  pachong1\.idea\misc.xml
     文件       18720  2019-01-10 11:56  pachong1\.idea\workspace.xml
     文件          81  2019-01-09 10:12  pachong1\pachong1.iml
     文件        1149  2019-01-10 10:27  pachong1\pom.xml
     目录           0  2019-01-10 11:56  pachong1\src\
     目录           0  2019-01-10 11:56  pachong1\src\main\
     目录           0  2019-01-10 11:56  pachong1\src\main\java\
     目录           0  2019-01-10 11:56  pachong1\src\main\java\crawlers\
     文件        2831  2019-01-10 10:12  pachong1\src\main\java\crawlers\Basic.java
     文件        2711  2019-01-10 10:26  pachong1\src\main\java\crawlers\FileDownload.java
     文件        1839  2019-01-10 10:26  pachong1\src\main\java\crawlers\HtmlManage.java
     文件        3333  2019-01-10 10:26  pachong1\src\main\java\crawlers\HttpGetConnect.java
     文件        2972  2019-01-10 10:36  pachong1\src\main\java\crawlers\SpiderKugou.java
     文件         223  2019-01-09 10:15  pachong1\src\main\java\crawlers\test.java
     目录           0  2019-01-09 10:12  pachong1\src\main\resources\
     目录           0  2019-01-10 11:56  pachong1\src\test\
     目录           0  2019-01-09 10:12  pachong1\src\test\java\
     目录           0  2019-01-10 11:56  pachong1\target\
     目录           0  2019-01-10 11:56  pachong1\target\classes\
     目录           0  2019-01-10 11:56  pachong1\target\classes\crawlers\
     文件        5055  2019-01-10 10:36  pachong1\target\classes\crawlers\Basic.class
     文件        3825  2019-01-10 10:36  pachong1\target\classes\crawlers\FileDownload.class
     文件        2887  2019-01-10 10:36  pachong1\target\classes\crawlers\HtmlManage.class
     文件        4629  2019-01-10 10:36  pachong1\target\classes\crawlers\HttpGetConnect.class
     文件        4326  2019-01-10 10:36  pachong1\target\classes\crawlers\SpiderKugou.class
     文件         545  2019-01-10 10:36  pachong1\target\classes\crawlers\test.class
     目录           0  2019-01-10 11:56  pachong1\target\generated-sources\
............此处省略1个文件信息

评论

共有 条评论