This software has been used to enter data for thousands of products in just one or two days, which shows how much manual work it can save. A programmer's job, after all, is to free people from repetitive and tedious work. The following is only the core code; the collection logic must be tied to the corresponding website. Author: Zheng shaoqun
The code is as follows:
// Extract the webpage on the final page of the product on the product list page
Private void button#click (object sender, EventArgs e)
{
If (textBox1.Text. Trim () = "" | textBox2.Text. Trim () = "")
{
MessageBox. Show ("the URL and domain name cannot be blank! "," Message prompt ", MessageBoxButtons. OK, MessageBoxIcon. Information );
Return;
}
Try
{
String Html = inc. GetHtml ("http://study.pctoday.net.cn ");
// ArrayList al = inc. GetMatchesStr (Html, "<a [^>] *?>. *? </A> ");
ArrayList al = inc. GetMatchesStr (Html, @ "href \ s * = \ s *(? : [\ '\ "" \ S] (? <1> [^ \ "" \ '] *) [\' \ ""]); // extract the link
"Title =" Replica Watches: "> Replica Watches Buy Full Quality Popular Luxury Watches at Amazing Price, Your One Stop Discount Swiss Watches StoreExclusive Replica Rolex Watches, Tag Heuer Watches Replica, cartier Watches online Sale!
StringBuilder sb = new StringBuilder ();
Foreach (object var in al)
{
String a = var. ToString (). Replace ("\" "," "). Replace ("'","");
A = Regex. Replace (a, "href =", "", RegexOptions. IgnoreCase | RegexOptions. Multiline );
If (a. StartsWith ("/"))
A = textBox2.Text. Trim () +;
If (! A. StartsWith ("http ://"))
A = "http: //" +;
Sb. Append (a + "\ r \ n ");
}
TextBox5.Text = sb. ToString (); // output the extracted URL to a textBox, where each link occupies one line.
MessageBox. Show ("Extracted" + al. Count. ToString () + "Link", "message prompt", MessageBoxButtons. OK, MessageBoxIcon. Information );
}
Catch (Exception err)
{
MessageBox. Show ("extraction error! Cause: "+ err. Message," Message prompt ", MessageBoxButtons. OK, MessageBoxIcon. Information );
}
}
// Process the html code of the collected product page as a string, extract the required code, and save it to a local access database, at the same time, extract the product image address and automatically upload the image to the local images folder.
Private void backgroundworker=dowork (object sender, DoWorkEventArgs e)
{
// Fill in the product table
Database. ExecuteNonQuery ("delete from Tb_Product ");
DataTable dt2 = new DataTable ();
OleDbConnection conn = new OleDbConnection (Database. ConnectionStrings );
OleDbDataAdapter da = new OleDbDataAdapter ("select * from Tb_Product", conn );
OleDbCommandBuilder cb = new OleDbCommandBuilder (da );
Da. Fill (dt2 );
Dt2.Rows. Clear ();
BackgroundWorker worker = (BackgroundWorker) sender; // This is a progress bar.
String [] Urls = textBox5.Text. Trim (). ToLower (). Replace ("\ r \ n", ","). Split (',');
DataTable dt = new DataTable ();
StringBuilder ErrorStr = new StringBuilder ();
String html = "", ImageDir = AppDomain. CurrentDomain. BaseDirectory + "Images \\";
// Collect the website every time
For (int I = 0; I <Urls. Length; I ++)
{
Try
{
If (! Worker. CancellationPending)
{
If (Urls [I] = "")
Return;
Html = inc. GetHtml (Urls [I]); // obtain the html code of the url
DataRow NewRow = dt2.NewRow ();
// Product Name
String ProductName = html. Substring (html. IndexOf ("<title>") + 7 );
NewRow ["ProductName"] = ProductName. Remove (ProductName. IndexOf ("</title>"). Trim ();
// Product NO.
NewRow ["ModelId"] = NewRow ["ProductName"]. toString (). substring (NewRow ["ProductName"]. toString (). indexOf ("Model:") + 6 ). trim ();
// Product introduction, which are modified based on the html of different websites
String Introduce = html. Substring (html. IndexOf ("Product Details") + 26 );
Introduce = Introduce. Remove (Introduce. IndexOf ("</table>") + 8). Trim ()
NewRow ["Introduce"] = Introduce;
"Title =" Replica Watches: "> Replica Watches Buy Full Quality Popular Luxury Watches at Amazing Price, Your One Stop Discount Swiss Watches StoreExclusive Replica Rolex Watches, Tag Heuer Watches Replica, cartier Watches online Sale!
// Download the image
String ProductImage = html. Substring (html. IndexOf ("align = center> ProductImage = textBox2.Text. Trim () + ProductImage. Substring (ProductImage. IndexOf ("src = \" ") + 5 );
ProductImage = ProductImage. Remove (ProductImage. IndexOf ("\""));
Try
{
Inc. DownFile (ProductImage, ImageDir + ProductImage. Substring (ProductImage. LastIndexOf ("/") + 1 ));
}
Catch (Exception)
{
ErrorStr. Append ("image download failed, image address:" + ImageDir + ProductImage. Substring (ProductImage. LastIndexOf ("/") + 1) + "\ r \ n ");
}
Dt2.Rows. Add (NewRow );
// Thread. Sleep (100 );
Worker. ReportProgress (I + 1) * 100/Urls. Length, I );
ToolStripStatusLabel1.Text = "processing progress:" + (I + 1). ToString () + "/" + Urls. Length. ToString (); // progress bar
}
}
Catch (Exception err)
{
ErrorStr. Append ("collection error:" + err. Message + "; Url:" + Urls [I] + "\ r \ n ");
}
}
Da. Update (dt2 );
DataBind (dt2 );
ShowError (ErrorStr. ToString ());
}
/// <Summary>
/// Generate a static Html page on the ASPX page by Zheng shaoqun
/// </Summary>
Public static string GetHtml (string url)
{
StreamReader sr = null;
String str = null;
// Read the remote path
WebRequest request = WebRequest. Create (url );
HttpWebResponse response = (HttpWebResponse) request. GetResponse ();
Sr = new StreamReader (response. GetResponseStream (), Encoding. GetEncoding (response. CharacterSet ));
Str = sr. ReadToEnd ();
Sr. Close ();
Return str;
}
// Extract the URL from the HTML code
Public static ArrayList GetMatchesStr (string htmlCode, string strRegex)
{
ArrayList al = new ArrayList ();
Regex r = new Regex (strRegex, RegexOptions. IgnoreCase | RegexOptions. Multiline );
MatchCollection m = r. Matches (htmlCode );
For (int I = 0; I <m. Count; I ++)
{
Bool rep = false;
String strNew = m [I]. ToString ();
// Filter duplicate URLs
Foreach (string str in al)
{
If (strNew = str)
{
Rep = true;
Break;
}
}
If (! Rep) al. Add (strNew );
}
Al. Sort ();
Return al;
}
Public static void DownFile (string Url, string Path)
{
HttpWebRequest request = (HttpWebRequest) WebRequest. Create (Url );
HttpWebResponse response = (HttpWebResponse) request. GetResponse ();
Stream stream = response. GetResponseStream ();
Long size = response. ContentLength;
// Create a file stream object
Using (FileStream fs = new FileStream (Path, FileMode. OpenOrCreate, FileAccess. Write ))
{
Byte [] B = new byte [1025];
Int n = 0;
While (n = stream. Read (B, 0, 1024)> 0)
{
Fs. Write (B, 0, n );
}
}
}