SEARU.ORG
当前位置:SEARU.ORG > Linux 软件 > 正文

数据采集爬虫框架 DotnetSpider

DotnetSpider 是开源的 .NET 跨平台数据采集爬虫框架。使用时需要 Scheduler、Downloader、Processor、Pipeline 四部分。

  public static void Main()
    {
        HttpClientDownloader downloader = new HttpClientDownloader();

        Core.Spider spider = Core.Spider.Create(new MyPageProcessor(), new QueueDuplicateRemovedScheduler()).AddPipeline(new MyPipeline()).SetThreadNum(1);
        var site = new Site() { EncodingName = "UTF-8" };
        for (int i = 1; i < 5; ++i)
        {
            site.AddStartUrl("http://www.youku.com/v_olist/c_97_g__a__sg__mt__lg__q__s_1_r_0_u_0_pt_0_av_0_ag_0_sg__pr__h__d_1_p_1.html");
        }
        spider.Site = site;
        spider.Start();
    }

    private class MyPipeline : IPipeline
    {
        public void Process(ResultItems resultItems, ISpider spider)
        {
            foreach (YoukuVideo entry in resultItems.Results["VideoResult"])
            {
                Console.WriteLine($"{entry.Name}:{entry.Click}");
            }

            //May be you want to save to database
            // 
        }

        public void Dispose()
        {
        }
    }

    private class MyPageProcessor : IPageProcessor
    {
        public void Process(Page page)
        {
            var totalVideoElements = page.Selectable.SelectList(Selectors.XPath("//div[@class='yk-col3']")).Nodes();
            List<YoukuVideo> results = new List<YoukuVideo>();
            foreach (var videoElement in totalVideoElements)
            {
                var video = new YoukuVideo();
                video.Name = videoElement.Select(Selectors.XPath("/div[4]/div[1]/a")).Value;
                video.Click = int.Parse(videoElement.Select(Selectors.Css("p-num")).Value.ToString());
                results.Add(video);
            }
            page.AddResultItem("VideoResult", results);
        }

        public Site Site => new Site { SleepTime = 0 };
    }

    public class YoukuVideo
    {
        public string Name { get; set; }
        public string Click { get; set; }
    }

未经允许不得转载:SEARU.ORG » 数据采集爬虫框架 DotnetSpider

赞 (0)
分享到:更多 ()

评论 0