C# Breadth-First Web Crawler

Source: Internet
Author: User
Tags: baseuri

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Tool;
using System.Net;
using System.Text.RegularExpressions;
using System.Threading;

namespace Search
{
    /// <summary>
    /// Main form of a small breadth-first web crawler.
    /// button1 downloads every image referenced by one page; button2 starts a
    /// background crawl that follows links starting from a fixed seed URL.
    /// </summary>
    /// <remarks>
    /// NOTE(review): zzHttp, Utility and Parser come from the external "Tool"
    /// assembly, which is not visible here. The casing of their members
    /// (SendDataByGET, GetBaseUri, ExtractLinks, Downfile) was reconstructed
    /// from a case-mangled listing -- confirm against the Tool assembly.
    /// </remarks>
    public partial class Form1 : Form
    {
        // Upper bound used by search(): the crawl loop stops once more than
        // this many pages have been visited.
        private const int MaxVisitedUrls = 1000;

        // Pattern for <img ... src="..."> tags; the image address is captured
        // in the named group "imgUrl". Built once instead of on every click.
        private static readonly Regex ImgRegex = new Regex(
            @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
            RegexOptions.IgnoreCase);

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// FIFO queue of URL strings backed by a linked list.
        /// </summary>
        public class Queue
        {
            private readonly LinkedList<string> queue = new LinkedList<string>();

            /// <summary>Appends <paramref name="t"/> to the tail of the queue.</summary>
            public void enQueue(string t)
            {
                queue.AddLast(t);
            }

            /// <summary>
            /// Removes and returns the element at the head of the queue.
            /// </summary>
            /// <returns>The oldest queued string, or null when the queue is empty.</returns>
            public string deQueue()
            {
                // Take from the head: enQueue appends to the tail, so removing
                // from the tail as well would turn this into a stack and the
                // crawl into a depth-first one.
                LinkedListNode<string> head = queue.First;
                if (head == null)
                {
                    return null;
                }

                queue.RemoveFirst();
                return head.Value;
            }

            /// <summary>Returns true when the queue holds no elements.</summary>
            public bool isQueueEmpty()
            {
                return queue.Count == 0;
            }

            /// <summary>Returns true when <paramref name="t"/> is currently queued.</summary>
            /// <remarks>The misspelled name is kept so existing callers keep compiling.</remarks>
            public bool contians(string t)
            {
                return queue.Contains(t);
            }

            /// <summary>Returns the number of queued elements.</summary>
            public int getcount()
            {
                return queue.Count;
            }
        }

        /// <summary>
        /// Process-wide crawl state: the set of URLs already visited and the
        /// queue of URLs still waiting to be visited.
        /// </summary>
        public class LinkQueue
        {
            // URLs that have already been downloaded.
            private static ISet<string> visitedUrl = new HashSet<string>();

            // URLs discovered but not yet downloaded.
            private static Queue unVisitedUrl = new Queue();

            /// <summary>Returns the queue of URLs waiting to be visited.</summary>
            public static Queue getUnVisitedUrl()
            {
                return unVisitedUrl;
            }

            /// <summary>Records <paramref name="url"/> as visited.</summary>
            public static void addVisitedUrl(String url)
            {
                visitedUrl.Add(url);
            }

            /// <summary>Removes <paramref name="url"/> from the visited set.</summary>
            public static void removeVisitedUrl(String url)
            {
                visitedUrl.Remove(url);
            }

            /// <summary>
            /// Dequeues the next URL to visit.
            /// </summary>
            /// <returns>The next URL as an object, or null when none is pending.</returns>
            public static Object unVisitedUrlDeQueue()
            {
                return unVisitedUrl.deQueue();
            }

            /// <summary>
            /// Queues <paramref name="url"/> unless it is null or blank, was
            /// already visited, or is already waiting in the queue, so that
            /// each URL is visited at most once.
            /// </summary>
            public static void addUnvisitedUrl(String url)
            {
                if (url != null
                    && !url.Trim().Equals("")
                    && !visitedUrl.Contains(url)
                    && !unVisitedUrl.contians(url))
                {
                    unVisitedUrl.enQueue(url);
                }
            }

            /// <summary>Returns the number of URLs visited so far.</summary>
            public static int getVisitedUrlNum()
            {
                return visitedUrl.Count;
            }

            /// <summary>Returns true when no URL is waiting to be visited.</summary>
            public static bool unVisitedUrlsEmpty()
            {
                return unVisitedUrl.isQueueEmpty();
            }
        }

        string[] urlarr = new string[100];

        /// <summary>
        /// Downloads the page named in textBox1 (or a default page when the box
        /// is empty), lists its links in richTextBox1, then downloads every
        /// image referenced by an img tag and shows the image count in label1.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            zzHttp http = new zzHttp();
            CookieContainer cookie = new CookieContainer();
            string url = textBox1.Text != "" ? textBox1.Text : "http://image.baidu.com/";
            string content = http.SendDataByGET(url, "", ref cookie);
            string baseUri = Utility.GetBaseUri(url);
            string[] links = Parser.ExtractLinks(baseUri, content);

            foreach (string link in links)
            {
                richTextBox1.Text += link;
                richTextBox1.Text += "\n";
            }

            // Collect every image address found in the page.
            MatchCollection matches = ImgRegex.Matches(content);
            Queue que = new Queue();
            foreach (Match match in matches)
            {
                que.enQueue(match.Groups["imgUrl"].Value);
            }

            // Drain the queue completely. Comparing a rising index against the
            // shrinking queue length would stop after about half the images.
            int k = 0;
            while (!que.isQueueEmpty())
            {
                string picurl = que.deQueue();
                richTextBox1.Text += picurl;
                richTextBox1.Text += "\n";

                // The file name is whatever follows the last '/' in the address.
                string[] s = picurl.Split('/');
                string picname = s[s.Length - 1];
                zzHttp.Downfile(picurl, picname, @"d:\pic\");
                k++;
            }

            label1.Text = k + "";
        }

        /// <summary>
        /// Crawl loop run on a worker thread: repeatedly takes the next pending
        /// URL, downloads it, marks it visited, and queues the links it contains.
        /// Stops when no URL is pending or the visited count exceeds the limit.
        /// </summary>
        void search()
        {
            LinkQueue.addUnvisitedUrl("http://blog.csdn.net/zhujunxxxxx/");

            while (!LinkQueue.unVisitedUrlsEmpty() && LinkQueue.getVisitedUrlNum() <= MaxVisitedUrls)
            {
                // Take the URL at the head of the pending queue.
                String visitUrl = (String)LinkQueue.unVisitedUrlDeQueue();
                if (visitUrl == null)
                {
                    continue;
                }

                zzHttp downLoader = new zzHttp();
                CookieContainer cookie = new CookieContainer();

                // Download the page, then record the URL as visited.
                string content = downLoader.SendDataByGET(visitUrl, "", ref cookie);
                LinkQueue.addVisitedUrl(visitUrl);

                // Extract the links contained in the downloaded page.
                string baseUri = Utility.GetBaseUri(visitUrl);
                string[] links = Parser.ExtractLinks(baseUri, content);

                Add2Message("accessed quantity:" + LinkQueue.getVisitedUrlNum() + ", count =" + LinkQueue.getUnVisitedUrl().getcount());

                foreach (string link in links)
                {
                    // Skip links whose text mentions a stylesheet, script or
                    // image extension (plain substring match, as written).
                    if (link.Contains("css") || link.Contains("js") || link.Contains("gif")
                        || link.Contains("jpg") || link.Contains("png") || link.Contains("jpeg"))
                    {
                        continue;
                    }

                    LinkQueue.addUnvisitedUrl(link);
                    AddMessage(link);
                }
            }
        }

        /// <summary>Starts the crawl on a separate thread.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
            Thread worker = new Thread(search);
            // Background thread: a foreground thread would keep the process
            // alive after the form has been closed.
            worker.IsBackground = true;
            worker.Start();
        }

        private delegate void InfoDelegate(string message);

        /// <summary>
        /// Appends <paramref name="message"/> plus a line break to richTextBox1
        /// and scrolls to the caret. When called from a thread that requires
        /// Invoke, re-dispatches itself through richTextBox1.Invoke.
        /// </summary>
        public void AddMessage(string message)
        {
            if (richTextBox1.InvokeRequired)
            {
                InfoDelegate d = new InfoDelegate(AddMessage);
                richTextBox1.Invoke(d, new object[] { message });
            }
            else
            {
                richTextBox1.AppendText(message + Environment.NewLine);
                richTextBox1.ScrollToCaret();
            }
        }

        private delegate void Info2Delegate(string message);

        /// <summary>
        /// Sets label2's text to <paramref name="message"/>. When called from a
        /// thread that requires Invoke, re-dispatches itself through label2.Invoke.
        /// </summary>
        public void Add2Message(string message)
        {
            if (label2.InvokeRequired)
            {
                Info2Delegate d = new Info2Delegate(Add2Message);
                label2.Invoke(d, new object[] { message });
            }
            else
            {
                label2.Text = message;
            }
        }
    }
}
    
   
  
 

Related Article

Contact Us

The content of this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.