Result page:

The general idea:
Start from an entry link, for example www.sina.com.cn, and crawl outward from it. Whenever a link is found, the page content can be parsed; given a keyword entered by the user, check whether the page contains it, and if it does, put the link and the relevant page content into a cache. The crawled links themselves also go into a cache, and the whole process is executed recursively.
The implementation is fairly crude; it is mainly a write-up for my own reference.
Ten threads are started at the same time, each with its own link-pool cache, and every link that contains the keyword goes into one shared cache. A service page refreshes itself on a timer and shows the current results. (This is only a simulation: a real search engine would first run word segmentation on the keyword, then store qualifying pages and links to files together with the page content; later searches would read results from those files, and their crawlers run around the clock.) A minimal sketch of the crawl loop comes first, followed by the concrete implementation.
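Here is that loop as a minimal single-threaded sketch. WebClient and a regex stand in for the HTTP and parsing classes shown later, and the CrawlSketch name and the keyword value are placeholders for illustration:
// A minimal, single-threaded sketch of the crawl loop described above.
using System;
using System.Collections.Generic;
using System.Net;
using System.Text.RegularExpressions;

static class CrawlSketch
{
    static void Main()
    {
        string keyword = "体育";                         // the keyword typed by the user (placeholder)
        var queue = new Queue<string>();
        var found = new List<string>();                  // links whose pages contain the keyword
        queue.Enqueue("http://www.sina.com.cn");         // entry link

        using (var client = new WebClient())
        {
            while (queue.Count > 0 && found.Count < 1000)
            {
                string url = queue.Dequeue();
                string html;
                try { html = client.DownloadString(url); }
                catch { continue; }                      // some sites refuse the request

                if (html.Contains(keyword))              // page contains the keyword -> keep it
                    found.Add(url);

                // pull out href targets and crawl them next
                foreach (Match m in Regex.Matches(html, "href=\"(http[^\"]+)\""))
                    queue.Enqueue(m.Groups[1].Value);
            }
        }
        found.ForEach(Console.WriteLine);
    }
}
The implementation below replaces the regex with HtmlAgilityPack and spreads the work across ten threads.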
Entity classes:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Threading;
namespace SpiderDemo.Entity
{
// Crawler thread
public class ClamThread
{
public Thread _thread { get; set; }
public List<Link> lnkPool { get; set; }
}
// A crawled link
public class Link
{
public string Href { get; set; }
public string LinkName { get; set; }
public string Context { get; set; }
public int TheadId { get; set; }
}
}
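A note on the Link class: the parser below filters duplicates with lnkPool.Contains(lk), but List<T>.Contains falls back to reference equality unless Link overrides Equals, so the same URL collected twice would not actually be filtered out. A possible variant of the entity (my addition, not in the original code) compares links by Href:
// Hypothetical change to the Link entity: compare links by Href so that
// List<Link>.Contains really filters out URLs that were already collected.
public class Link
{
    public string Href { get; set; }
    public string LinkName { get; set; }
    public string Context { get; set; }
    public int TheadId { get; set; }

    public override bool Equals(object obj)
    {
        Link other = obj as Link;
        return other != null && Href == other.Href;
    }

    public override int GetHashCode()
    {
        return Href == null ? 0 : Href.GetHashCode();
    }
}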
Cache class:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using SpiderDemo.Entity;
using System.Threading;
namespace SpiderDemo.SearchUtil
{
public static class CacheHelper
{
public static bool EnableSearch;
/// <summary>
/// Starting URL
/// </summary>
public const string StartUrl = "http://www.sina.com.cn";
/// <summary>
/// Maximum number of links to crawl per pool; with some performance tuning and timely release of resources it could keep crawling indefinitely
/// </summary>
public const int MaxNum = 300;
/// <summary>
/// Return at most 1000 results
/// </summary>
public const int MaxResult = 1000;
/// <summary>
/// Number of matches crawled so far
/// </summary>
public static int SpideNum;
/// <summary>
/// The keyword
/// </summary>
public static string KeyWord;
/// <summary>
/// Running time
/// </summary>
public static int RuningTime;
/// <summary>
/// Maximum running time
/// </summary>
public static int MaxRuningtime;
/// <summary>
/// 10 threads crawl at the same time
/// </summary>
public static ClamThread[] ThreadList = new ClamThread[10];
/// <summary>
/// Links found on the first crawl: the shared link pool
/// </summary>
public static List<Link> LnkPool = new List<Link>();
/// <summary>
/// Valid links (pages that contain the keyword)
/// </summary>
public static List<Link> validLnk = new List<Link>();
/// <summary>
/// Lock object so threads do not take the same link
/// </summary>
public static readonly object syncObj = new object();
}
}
HTTP request class:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using System.Net;
using System.IO;
using System.Threading;
namespace SpiderDemo.SearchUtil
{
public static class HttpPostUtility
{
/// <summary>
/// Written synchronously for now; to be optimized later
/// </summary>
/// <param name="url"></param>
/// <returns></returns>
public static Stream SendReq(string url)
{
try
{
if (string.IsNullOrEmpty(url))
{
return null;
}
// WebProxy wp = new WebProxy("10.0.1.33:8080");
//wp.Credentials = new System.Net.NetworkCredential("*****", "******", "feinno"); // a proxy used to be required here to reach the network
HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create(url);
//myRequest.Proxy = wp;
HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse();
return myResponse.GetResponseStream();
}
// Requests to some sites may be blocked or restricted
catch (Exception ex)
{
return null;
}
}
}
}
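The comment in SendReq says the request is written synchronously for now and will be optimized later. One possible direction, sketched here only as an assumption, is to wrap the same request in a Task using the Begin/End pair that HttpWebRequest already exposes (available with .NET 4's Task Parallel Library):
// Hypothetical asynchronous variant of SendReq; error handling omitted.
using System;
using System.IO;
using System.Net;
using System.Threading.Tasks;

namespace SpiderDemo.SearchUtil
{
    public static class HttpPostUtilityAsync
    {
        public static Task<Stream> SendReqAsync(string url)
        {
            HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create(url);
            return Task.Factory
                .FromAsync<WebResponse>(myRequest.BeginGetResponse, myRequest.EndGetResponse, null)
                .ContinueWith(t => t.Result.GetResponseStream());
        }
    }
}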
Page-parsing class. It uses a component, HtmlAgilityPack.dll, which works very well; download link: http://www.php.cn/
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Threading;
using System.Text;
using System.Xml;
using System.Xml.Linq;
using HtmlAgilityPack;
using System.IO;
using SpiderDemo.Entity;
namespace SpiderDemo.SearchUtil
{
public static class UrlAnalysisProcessor
{
public static void GetHrefs(Link url, Stream s, List<Link> lnkPool)
{
try
{
// No HTML stream; return immediately
if (s == null)
{
return;
}
// Parse out links and put them in the cache for the front end to pick up. For now each thread caches at most 300; beyond that stop storing them, the consumer is too slow anyway!
if (lnkPool.Count >= CacheHelper.MaxNum)
{
return;
}
// Load the HTML; found HtmlAgilityPack, let's see how this component does
HtmlAgilityPack.HtmlDocument doc = new HtmlDocument();
// Specify an encoding when loading so Chinese text is less likely to come out garbled
doc.Load(s, Encoding.Default);
// Get all the links
IEnumerable<HtmlNode> nodeList =
doc.DocumentNode.SelectNodes("//a[@href]");// the link-grabbing approach; see the StackOverflow answer at:
// http://www.php.cn/
// Remove scripts
foreach (var script in doc.DocumentNode.Descendants("script").ToArray())
script.Remove();
// Remove styles
foreach (var style in doc.DocumentNode.Descendants("style").ToArray())
style.Remove();
string allText = doc.DocumentNode.InnerText;
int index = 0;
// If the page contains the keyword, this link qualifies
if ((index = allText.IndexOf(CacheHelper.KeyWord)) != -1)
{
// Pull out the context around the keyword, about 40 characters
if (index > 20 && index < allText.Length - 20 - CacheHelper.KeyWord.Length)
{
// Highlight the keyword (the original markup was stripped; <b> tags are reconstructed here)
string keyText = allText.Substring(index - 20, 20) +
"<b>" + allText.Substring(index, CacheHelper.KeyWord.Length) + "</b>" +
allText.Substring(index + CacheHelper.KeyWord.Length, 20);
url.Context = keyText;
}
CacheHelper.validLnk.Add(url);
//RecordUtility.AppendLog(url.LinkName + "<br/>");
// Found a qualifying link; increment the counter
CacheHelper.SpideNum++;
}
foreach (HtmlNode node in nodeList)
{
if(node.Attributes["href"] == null)
{
continue;
}
else
{
Link lk = new Link()
{
Href = node.Attributes["href"].Value,
// anchor text plus its URL; the original markup around these strings was stripped
LinkName = node.InnerText + " " + node.Attributes["href"].Value + "<br/>"
};
if(lk.Href.StartsWith("javascript"))
{
continue;
}
else if(lk.Href.StartsWith("#"))
{
continue;
}
else if(lnkPool.Contains(lk))
{
continue;
}
else
{
// Add it to this thread's link pool
lnkPool.Add(lk);
}
}
}
}
catch (Exception ex)
{
}
}
}
}
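One more note before the code-behind: CacheHelper.validLnk is a plain List<Link> that all ten crawler threads append to in GetHrefs, and List<T> is not safe for concurrent writers. CacheHelper already exposes a syncObj lock object; a small helper like the one below (my addition, not in the original code) could take over the Add, with GetHrefs calling ResultStore.AddResult(url) instead of touching the list directly:
// Hypothetical helper: add a matching link to the shared result list under
// CacheHelper.syncObj, so concurrent Add calls cannot corrupt the list.
using SpiderDemo.Entity;

namespace SpiderDemo.SearchUtil
{
    public static class ResultStore
    {
        public static void AddResult(Link url)
        {
            lock (CacheHelper.syncObj)
            {
                if (!CacheHelper.validLnk.Contains(url))
                {
                    CacheHelper.validLnk.Add(url);
                    CacheHelper.SpideNum++;
                }
            }
        }
    }
}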
Search page code-behind:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using SpiderDemo.SearchUtil;
using System.Threading;
using System.IO;
using SpiderDemo.Entity;
namespace SpiderDemo
{
public partial class SearchPage : System.Web.UI.Page
{
protected void Page_Load(object sender, EventArgs e)
{
if (!IsPostBack)
{
InitSetting();
}
}
private void InitSetting()
{
}
private void StartWork()
{
CacheHelper.EnableSearch = true;
CacheHelper.KeyWord = txtKeyword.Text;
// The first request goes to Sina; get back the HTML stream
Stream htmlStream = HttpPostUtility.SendReq(CacheHelper.StartUrl);
Link startLnk = new Link()
{
Href = CacheHelper.StartUrl,
LinkName = " 新浪 " + CacheHelper.StartUrl + " "
};
// Parse out the links
UrlAnalysisProcessor.GetHrefs(startLnk, htmlStream, CacheHelper.LnkPool);
for (int i = 0; i < CacheHelper.ThreadList.Length; i++)
{
CacheHelper.ThreadList[i] = new ClamThread();
CacheHelper.ThreadList[i].lnkPool = new List<Link>();
}
// Divide the links evenly among the threads
for (int i = 0; i < CacheHelper.LnkPool.Count; i++)
{
int tIndex = i % CacheHelper.ThreadList.Length;
CacheHelper.ThreadList[tIndex].lnkPool.Add(CacheHelper.LnkPool[i]);
}
Action<ClamThread> clamIt = new Action<ClamThread>((clt) =>
{
Stream s = HttpPostUtility.SendReq(clt.lnkPool[0].Href);
DoIt(clt, s, clt.lnkPool[0]);
});
for (int i = 0; i < CacheHelper.ThreadList.Length; i++)
{
// Copy the loop variable: the lambda below captures it, and without a local copy
// a thread that starts late could read an i that has already moved on.
int idx = i;
CacheHelper.ThreadList[idx]._thread = new Thread(new ThreadStart(() =>
{
clamIt(CacheHelper.ThreadList[idx]);
}));
// Sleep 100 ms as each thread starts working
CacheHelper.ThreadList[idx]._thread.Start();
Thread.Sleep(100);
}
}
private void DoIt(ClamThread thread, Stream htmlStream, Link url)
{
if (!CacheHelper.EnableSearch)
{
return;
}
if (CacheHelper.SpideNum > CacheHelper.MaxResult)
{
return;
}
// Parse the page: cache the URL if it qualifies, and pull the page's links into the cache
UrlAnalysisProcessor.GetHrefs(url, htmlStream, thread.lnkPool);
// If there are links left, take the first one and send a request; otherwise stop, this thing is resource-hungry anyway
if (thread.lnkPool.Count > 0)
{
Link firstLnk;
firstLnk = thread.lnkPool[0];
// Remove the link from the cache once it has been taken
thread.lnkPool.Remove(firstLnk);
firstLnk.TheadId = Thread.CurrentThread.ManagedThreadId;
Stream content = HttpPostUtility.SendReq(firstLnk.Href);
DoIt(thread, content, firstLnk);
}
else
{
// No links left; stop this thread and see how the others do
thread._thread.Abort();
}
}
protected void btnSearch_Click(object sender, EventArgs e)
{
this.StartWork();
}
protected void btnShow_Click(object sender, EventArgs e)
{
}
protected void btnStop_Click(object sender, EventArgs e)
{
foreach (var t in CacheHelper.ThreadList)
{
t._thread.Abort();
Thread.DisableComObjectEagerCleanup(); // static on Thread; cannot be called through an instance
}
CacheHelper.EnableSearch = false;
//CacheHelper.ValidLnk.Clear();
CacheHelper.LnkPool.Clear();
CacheHelper.validLnk.Clear();
}
}
}
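btnStop_Click above tears the workers down with Thread.Abort, which can interrupt a thread at an arbitrary point. Since DoIt already checks CacheHelper.EnableSearch on every recursion, a gentler stop is to flip that flag and let each thread return on its own; a sketch of that variant (an assumption, not the original handler):
// Hypothetical cooperative-stop version of btnStop_Click: clear the flag and let
// every DoIt recursion notice it and return, instead of aborting the threads.
protected void btnStop_Click(object sender, EventArgs e)
{
    CacheHelper.EnableSearch = false;   // DoIt returns as soon as it sees this
    CacheHelper.LnkPool.Clear();
    CacheHelper.validLnk.Clear();
}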
Search page front-end code:
<%@ Page Language="C#" AutoEventWireup="true" CodeBehind="SearchPage.aspx.cs" Inherits="SpiderDemo.SearchPage" %>
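The service page mentioned at the beginning refreshes on a timer and shows whatever is currently in CacheHelper.validLnk. It might look roughly like the code-behind below; ResultPage is an assumed page name, not part of the original project, its markup would reload itself with a meta refresh tag or a script timer, and the handler simply writes out the current results:
// Hypothetical code-behind for the periodically refreshed results page.
// On every load it renders the links collected so far by the crawler threads.
using System;
using System.Text;
using SpiderDemo.SearchUtil;

namespace SpiderDemo
{
    public partial class ResultPage : System.Web.UI.Page
    {
        protected void Page_Load(object sender, EventArgs e)
        {
            StringBuilder sb = new StringBuilder();
            lock (CacheHelper.syncObj)   // read the shared list under the same lock the writers would use
            {
                foreach (var lnk in CacheHelper.validLnk)
                {
                    sb.AppendFormat("<a href=\"{0}\">{1}</a><br/>{2}<hr/>",
                        lnk.Href, lnk.LinkName, lnk.Context);
                }
            }
            Response.Write(sb.ToString());
        }
    }
}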
