ASP.NET (C#) as a web data collection tool

Last Update:2013-10-17 Source: Internet

Author: User

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more ＞

This software has been used to enter data for thousands of products in one or two days, which shows how much work does not need to be done by hand. Part of a programmer's job is to free people from repetitive, tedious work. The following is only the core code; the collection logic must be adapted to the specific website being collected from. Author: Zheng Shaoqun

The code is as follows:
// Extracts every link found on the product list page and writes the resulting
// product-page URLs into textBox5, one URL per line.
//   textBox1 - URL of the product list page to scan.
//   textBox2 - site domain, prepended to root-relative links ("/path").
// NOTE(review): the handler name was garbled in the published listing ("button#click");
// "button1_Click" is the conventional designer name - confirm against the form's event wiring.
private void button1_Click(object sender, EventArgs e)
{
    string pageUrl = textBox1.Text.Trim();
    string domain = textBox2.Text.Trim();
    if (pageUrl == "" || domain == "")
    {
        MessageBox.Show("the URL and domain name cannot be blank!", "Message prompt", MessageBoxButtons.OK, MessageBoxIcon.Information);
        return;
    }

    try
    {
        // The published listing validated textBox1 but then fetched a hard-coded URL;
        // the validated URL is what is fetched here.
        string html = inc.GetHtml(pageUrl);

        // Matches href='...' / href="..." attributes; GetMatchesStr returns the whole
        // attribute text (e.g. href="/a.html"), de-duplicated and sorted.
        ArrayList al = inc.GetMatchesStr(html, @"href\s*=\s*(?:[\'\""\s](?<1>[^\""\']*)[\'\""])"); // extract the link

        StringBuilder sb = new StringBuilder();
        foreach (object match in al)
        {
            // Reduce the matched attribute to the bare link: drop the quotes, then the
            // leading "href =" (with whatever spacing the page used).
            string link = match.ToString().Replace("\"", "").Replace("'", "");
            link = Regex.Replace(link, @"^href\s*=\s*", "", RegexOptions.IgnoreCase).Trim();

            // Root-relative links are resolved against the configured domain.
            if (link.StartsWith("/", StringComparison.Ordinal))
            {
                link = domain + link;
            }

            // Add a scheme only when the link has none, so https:// links stay intact.
            if (!link.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
                !link.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                link = "http://" + link;
            }

            sb.Append(link + "\r\n");
        }

        textBox5.Text = sb.ToString(); // output the extracted URLs to a textBox, where each link occupies one line.

        MessageBox.Show("Extracted " + al.Count.ToString() + " Link", "message prompt", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
    catch (Exception err)
    {
        MessageBox.Show("extraction error! Cause: " + err.Message, "Message prompt", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
}

// Downloads every product page listed in textBox5 (one URL per line), extracts the product
// name, model number and introduction from the page HTML as plain strings, stores one row per
// product in the Tb_Product table of the local Access database, and downloads each product
// image into the local "Images" folder next to the executable.
//
// The IndexOf/Substring markers and offsets below are tied to the HTML of the site being
// collected and have to be adjusted for a different site.
//
// NOTE(review): the handler name was garbled in the published listing
// ("backgroundworker=dowork"); "backgroundWorker1_DoWork" is the conventional designer name -
// confirm against the form's event wiring.
// NOTE(review): this handler reads textBox2/textBox5 and updates toolStripStatusLabel1, and
// calls DataBind/ShowError, from inside DoWork. If those touch UI controls they presumably need
// to be marshalled to the UI thread (ProgressChanged / RunWorkerCompleted) - verify.
private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
{
    // Fill in the product table: existing products are deleted first, then the (empty)
    // table is loaded only to obtain its column schema.
    Database.ExecuteNonQuery("delete from Tb_Product");
    DataTable dt2 = new DataTable();

    using (OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings))
    using (OleDbDataAdapter da = new OleDbDataAdapter("select * from Tb_Product", conn))
    // The command builder looks unused but it supplies the INSERT command da.Update needs.
    using (OleDbCommandBuilder cb = new OleDbCommandBuilder(da))
    {
        da.Fill(dt2);
        dt2.Rows.Clear();

        BackgroundWorker worker = (BackgroundWorker)sender; // used for cancellation and progress reporting

        // One URL per line. Case is preserved (URL paths can be case-sensitive) and blank
        // lines are dropped up front.
        string[] urls = textBox5.Text.Split(new[] { "\r\n", "\n" }, StringSplitOptions.RemoveEmptyEntries);
        string domain = textBox2.Text.Trim();
        StringBuilder errorStr = new StringBuilder();
        string imageDir = AppDomain.CurrentDomain.BaseDirectory + "Images\\";
        Directory.CreateDirectory(imageDir); // no-op when the folder already exists

        // Collect each page in turn
        for (int i = 0; i < urls.Length; i++)
        {
            // Stop collecting as soon as cancellation is requested; rows gathered so far
            // are still saved below.
            if (worker.CancellationPending)
            {
                break;
            }

            string url = urls[i].Trim();
            if (url == "")
            {
                continue; // skip a whitespace-only line instead of aborting the whole run
            }

            try
            {
                string html = inc.GetHtml(url); // obtain the html code of the url
                DataRow newRow = dt2.NewRow();

                // Product name: the text between <title> and </title>
                string productName = html.Substring(html.IndexOf("<title>") + 7);
                productName = productName.Remove(productName.IndexOf("</title>")).Trim();
                newRow["ProductName"] = productName;

                // Product NO.: whatever follows "Model:" in the product name
                newRow["ModelId"] = productName.Substring(productName.IndexOf("Model:") + 6).Trim();

                // Product introduction, which is modified based on the html of different websites:
                // from 26 characters past "Product Details" up to and including the next </table>
                string introduce = html.Substring(html.IndexOf("Product Details") + 26);
                introduce = introduce.Remove(introduce.IndexOf("</table>") + 8).Trim();
                newRow["Introduce"] = introduce;

                // Download the image
                // NOTE(review): the start marker of the image cell was truncated in the published
                // listing (the HTML inside the string literal was lost); "align=center>" is the
                // part that survives - confirm the full marker against the target site's HTML.
                string productImage = html.Substring(html.IndexOf("align=center>"));
                productImage = domain + productImage.Substring(productImage.IndexOf("src=\"") + 5);
                productImage = productImage.Remove(productImage.IndexOf("\""));
                try
                {
                    inc.DownFile(productImage, imageDir + productImage.Substring(productImage.LastIndexOf("/") + 1));
                }
                catch (Exception)
                {
                    // A failed image must not lose the product row; record the image URL and go on.
                    errorStr.Append("image download failed, image address:" + productImage + "\r\n");
                }

                dt2.Rows.Add(newRow);

                worker.ReportProgress((i + 1) * 100 / urls.Length, i);
                toolStripStatusLabel1.Text = "processing progress:" + (i + 1).ToString() + "/" + urls.Length.ToString(); // progress text
            }
            catch (Exception err)
            {
                // A page that cannot be fetched or parsed is logged and skipped.
                errorStr.Append("collection error:" + err.Message + "; Url:" + url + "\r\n");
            }
        }

        da.Update(dt2);
        DataBind(dt2);
        ShowError(errorStr.ToString());
    }
}

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <remarks>Original author: Zheng shaoqun.</remarks>
/// <param name="url">Absolute URL of the page to fetch (HTTP or HTTPS).</param>
/// <returns>
/// The response body, decoded with the charset the server declares, or with UTF-8 when the
/// server declares none or one that .NET does not recognise.
/// </returns>
public static string GetHtml(string url)
{
    // Read the remote path
    WebRequest request = WebRequest.Create(url);

    // using blocks release the connection and reader even when reading throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        Encoding encoding = Encoding.UTF8;
        string charset = response.CharacterSet;
        if (!string.IsNullOrEmpty(charset))
        {
            try
            {
                // Some servers quote the charset value; strip that before the lookup.
                encoding = Encoding.GetEncoding(charset.Trim(' ', '"', '\''));
            }
            catch (ArgumentException)
            {
                // Unknown or malformed charset name: keep the UTF-8 fallback.
            }
        }

        using (Stream body = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(body, encoding))
        {
            return sr.ReadToEnd();
        }
    }
}

/// <summary>
/// Returns the distinct substrings of <paramref name="htmlCode"/> matched by
/// <paramref name="strRegex"/>, sorted with <see cref="ArrayList.Sort()"/>.
/// Used to extract the URLs from HTML code.
/// </summary>
/// <param name="htmlCode">Text to search.</param>
/// <param name="strRegex">Regular expression; applied with IgnoreCase and Multiline.</param>
/// <returns>An ArrayList of strings, one per distinct full match.</returns>
public static ArrayList GetMatchesStr(string htmlCode, string strRegex)
{
    ArrayList al = new ArrayList();

    // Filter duplicate URLs with a hash set instead of rescanning the list for every match.
    System.Collections.Generic.HashSet<string> seen = new System.Collections.Generic.HashSet<string>();

    Regex r = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    foreach (Match m in r.Matches(htmlCode))
    {
        // Add returns false when the value was already present.
        if (seen.Add(m.Value))
        {
            al.Add(m.Value);
        }
    }

    al.Sort();

    return al;
}

/// <summary>
/// Downloads the resource at <paramref name="Url"/> and saves it to <paramref name="Path"/>.
/// </summary>
/// <param name="Url">Absolute HTTP/HTTPS address of the file to download.</param>
/// <param name="Path">Local file path to write; an existing file is replaced.</param>
public static void DownFile(string Url, string Path)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

    // using blocks close the connection, the network stream and the file even on failure.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    // FileMode.Create truncates an existing file, so a shorter download cannot leave
    // stale bytes from a previous, larger file at the end.
    using (FileStream fs = new FileStream(Path, FileMode.Create, FileAccess.Write))
    {
        stream.CopyTo(fs);
    }
}

This article is an English version of an article originally written in Chinese on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership, or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More

Asp.net (c #) as a web data collection tool

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support

Asp.net (c #) as a web data collection tool

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

Trending Topic

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support