ASP.NET (C#) as a web data collection tool

Source: Internet
Author: User

This software has been used to enter data for thousands of products in just one or two days, which shows how much of this work does not need to be done by hand. Part of a programmer's job is to free people from repetitive, tedious work. The following is only the core code; the extraction logic must be adapted to the website being collected from. Author: Zheng Shaoqun

The code is as follows:
// Click handler: fetches the product list page whose URL is in textBox1, extracts every
// href found in it, and writes the links (one absolute URL per line) to textBox5.
// textBox2 holds the site's domain and is prepended to root-relative links.
private void button1_Click(object sender, EventArgs e)
{
    if (textBox1.Text.Trim() == "" || textBox2.Text.Trim() == "")
    {
        MessageBox.Show("the URL and domain name cannot be blank! ", " Message prompt ", MessageBoxButtons.OK, MessageBoxIcon.Information);
        return;
    }

    try
    {
        // Fetch the page the user entered. The listing validated textBox1 but then
        // downloaded a hard-coded address, which made the URL box pointless.
        string html = inc.GetHtml(textBox1.Text.Trim());

        // Matches href = "..." / href='...' attributes; GetMatchesStr returns the whole match.
        ArrayList links = inc.GetMatchesStr(html, @"href\s*=\s*(?:['""\s](?<1>[^""']*)['""])");

        string domain = textBox2.Text.Trim();
        StringBuilder sb = new StringBuilder();
        foreach (object match in links)
        {
            // Drop the surrounding quotes, then the leading "href =" (with any spacing the
            // extraction regex tolerated), leaving only the link target.
            string link = match.ToString().Replace("\"", "").Replace("'", "");
            link = Regex.Replace(link, @"^href\s*=\s*", "", RegexOptions.IgnoreCase | RegexOptions.Multiline).Trim();

            if (link.StartsWith("/", StringComparison.Ordinal))
            {
                link = domain + link;
            }

            // Only add a scheme when there is none; https links must not become "http://https://...".
            bool hasScheme = link.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || link.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
            {
                link = "http://" + link;
            }

            sb.Append(link + "\r\n");
        }

        // Output the extracted URLs to the text box, one link per line.
        textBox5.Text = sb.ToString();

        MessageBox.Show("Extracted" + links.Count.ToString() + "Link", "message prompt", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
    catch (Exception err)
    {
        MessageBox.Show("extraction error! Cause: " + err.Message, " Message prompt ", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
}




// Process the HTML of each collected product page as a string, extract the required fields, and save them to a local Access database; at the same time, extract the product image address and automatically download the image to the local Images folder.

// DoWork handler for the collection run. It empties Tb_Product, then for every URL listed
// in textBox5 it fetches the page, cuts the product name, model number, introduction and
// image address out of the HTML, downloads the image into the Images folder, and finally
// writes the collected rows back through the data adapter.
// NOTE(review): this listing is garbled (capitalised keywords/identifiers, stray spaces,
// injected text); it does not compile as printed.
Private void backgroundworker=dowork (object sender, DoWorkEventArgs e)
{
// Empty the product table, then load its (now empty) schema into dt2 so new rows can be added
Database. ExecuteNonQuery ("delete from Tb_Product ");
DataTable dt2 = new DataTable ();
OleDbConnection conn = new OleDbConnection (Database. ConnectionStrings );
OleDbDataAdapter da = new OleDbDataAdapter ("select * from Tb_Product", conn );
// The command builder is attached to the adapter; presumably it supplies the INSERT
// command used by Da. Update at the end — confirm
OleDbCommandBuilder cb = new OleDbCommandBuilder (da );
Da. Fill (dt2 );
Dt2.Rows. Clear ();

BackgroundWorker worker = (BackgroundWorker) sender; // the worker raising this event; used for cancellation checks and progress reports

// One URL per line in textBox5; line breaks are turned into commas and then split.
// NOTE(review): ToLower () also lower-cases the URL path — may break case-sensitive URLs; confirm
String [] Urls = textBox5.Text. Trim (). ToLower (). Replace ("\ r \ n", ","). Split (',');
// NOTE(review): dt is not used anywhere else in this block
DataTable dt = new DataTable ();
StringBuilder ErrorStr = new StringBuilder ();
String html = "", ImageDir = AppDomain. CurrentDomain. BaseDirectory + "Images \\";

// Process each URL in turn
For (int I = 0; I <Urls. Length; I ++)
{
Try
{
If (! Worker. CancellationPending)
{
// NOTE(review): this Return leaves the whole handler on the first empty entry, so the
// Da. Update / DataBind / ShowError calls after the loop are not reached
If (Urls [I] = "")
Return;
Html = inc. GetHtml (Urls [I]); // obtain the html code of the url
DataRow NewRow = dt2.NewRow ();

// Product name: the text between <title> and </title>
String ProductName = html. Substring (html. IndexOf ("<title>") + 7 );
NewRow ["ProductName"] = ProductName. Remove (ProductName. IndexOf ("</title>"). Trim ();

// Product number: whatever follows "Model:" inside the product name
NewRow ["ModelId"] = NewRow ["ProductName"]. toString (). substring (NewRow ["ProductName"]. toString (). indexOf ("Model:") + 6 ). trim ();

// Product introduction: from just after "Product Details" up to and including the next </table>.
// The markers and offsets must be adapted to the HTML of each target website.
String Introduce = html. Substring (html. IndexOf ("Product Details") + 26 );
// NOTE(review): the next statement has no terminating semicolon
Introduce = Introduce. Remove (Introduce. IndexOf ("</table>") + 8). Trim ()

NewRow ["Introduce"] = Introduce;



// NOTE(review): the next line looks like advertising text injected into the listing, not code — confirm and remove
"Title =" Replica Watches: "> Replica Watches Buy Full Quality Popular Luxury Watches at Amazing Price, Your One Stop Discount Swiss Watches StoreExclusive Replica Rolex Watches, Tag Heuer Watches Replica, cartier Watches online Sale!
// Download the image
// NOTE(review): the next line appears truncated — the marker string passed to IndexOf never
// closes and a second assignment to ProductImage is fused onto it; restore from the original
String ProductImage = html. Substring (html. IndexOf ("align = center> ProductImage = textBox2.Text. Trim () + ProductImage. Substring (ProductImage. IndexOf ("src = \" ") + 5 );
ProductImage = ProductImage. Remove (ProductImage. IndexOf ("\""));
// An image download failure is recorded in ErrorStr; the row is still added below
Try
{
Inc. DownFile (ProductImage, ImageDir + ProductImage. Substring (ProductImage. LastIndexOf ("/") + 1 ));
}
Catch (Exception)
{
ErrorStr. Append ("image download failed, image address:" + ImageDir + ProductImage. Substring (ProductImage. LastIndexOf ("/") + 1) + "\ r \ n ");
}


Dt2.Rows. Add (NewRow );

// Thread. Sleep (100 );
// NOTE(review): parentheses on the next line are unbalanced as printed
Worker. ReportProgress (I + 1) * 100/Urls. Length, I );
// NOTE(review): this sets a UI control's text from inside the DoWork handler; DoWork handlers
// normally run off the UI thread, so this may belong in ProgressChanged — confirm
ToolStripStatusLabel1.Text = "processing progress:" + (I + 1). ToString () + "/" + Urls. Length. ToString (); // status text showing processed/total
}

}
Catch (Exception err)
{
// Any failure for this URL is recorded and the loop moves on to the next one
ErrorStr. Append ("collection error:" + err. Message + "; Url:" + Urls [I] + "\ r \ n ");
}
}
// Persist the collected rows, refresh the bound view, and show the accumulated errors
Da. Update (dt2 );
DataBind (dt2 );
ShowError (ErrorStr. ToString ());
}

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>
/// The response body, decoded with the character set declared by the server, or with
/// UTF-8 when the server declares none or one that is not recognised.
/// </returns>
public static string GetHtml(string url)
{
    WebRequest request = WebRequest.Create(url);

    // Dispose the response, its stream and the reader even when reading throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream body = response.GetResponseStream())
    {
        // Encoding.GetEncoding throws on a null, empty or unknown charset name;
        // fall back to UTF-8 instead of failing the whole download.
        Encoding encoding = Encoding.UTF8;
        string charset = response.CharacterSet;
        if (!string.IsNullOrEmpty(charset))
        {
            try
            {
                encoding = Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                encoding = Encoding.UTF8;
            }
        }

        using (StreamReader reader = new StreamReader(body, encoding))
        {
            return reader.ReadToEnd();
        }
    }
}


/// <summary>
/// Returns the distinct substrings of <paramref name="htmlCode"/> that match
/// <paramref name="strRegex"/>, sorted in ascending order.
/// </summary>
/// <param name="htmlCode">Text to search (typically a page's HTML).</param>
/// <param name="strRegex">Regular expression; applied with IgnoreCase and Multiline.</param>
/// <returns>An <see cref="ArrayList"/> of strings holding each full match once.</returns>
public static ArrayList GetMatchesStr(string htmlCode, string strRegex)
{
    ArrayList al = new ArrayList();

    // Tracks matches already added so duplicate filtering does not rescan the list for every match.
    System.Collections.Generic.HashSet<string> seen =
        new System.Collections.Generic.HashSet<string>(StringComparer.Ordinal);

    Regex r = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    foreach (Match m in r.Matches(htmlCode))
    {
        string value = m.Value;
        if (seen.Add(value))
        {
            al.Add(value);
        }
    }

    al.Sort();

    return al;
}

/// <summary>
/// Downloads the resource at <paramref name="Url"/> and writes it to the file at
/// <paramref name="Path"/>, replacing any existing file.
/// </summary>
/// <param name="Url">Absolute HTTP URL of the resource to download.</param>
/// <param name="Path">Destination file path.</param>
public static void DownFile(string Url, string Path)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

    // Dispose the response and its stream even when the copy fails part-way.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    // FileMode.Create truncates an existing file; OpenOrCreate would leave stale trailing
    // bytes behind when the new download is shorter than the old file.
    using (FileStream fs = new FileStream(Path, FileMode.Create, FileAccess.Write))
    {
        byte[] buffer = new byte[1024];
        int n;
        // Parenthesised so n receives the byte count rather than the result of the comparison.
        while ((n = stream.Read(buffer, 0, buffer.Length)) > 0)
        {
            fs.Write(buffer, 0, n);
        }
    }
}

Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.