With this tool, entering the data for thousands of products took only a day or two, which shows that a lot of work does not have to be done by brute force: part of a programmer's job is to free people from repetitive, tedious tasks. Only the core code is shown below, and the scraping logic must be adapted to the specific target site. Author: Zheng
Copy code. The code is as follows:
// Extract the links to the individual product detail pages from the product list page
/// <summary>
/// Downloads the product list page named in textBox1, extracts every href link from it,
/// and writes the links to textBox5 as absolute URLs, one per line.
/// </summary>
/// <remarks>
/// Links that start with "/" are prefixed with the domain entered in textBox2; links
/// without an http/https scheme are prefixed with "http://".
/// </remarks>
/// <param name="sender">The control that raised the click event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
{
    string listUrl = textBox1.Text.Trim();
    string domain = textBox2.Text.Trim();
    if (listUrl == "" || domain == "")
    {
        MessageBox.Show("url and domain name can not be empty!", "Information Tips", MessageBoxButtons.OK, MessageBoxIcon.Information);
        return;
    }

    try
    {
        // Fetch the page the user entered. The published listing validated textBox1 but then
        // fetched a hard-coded sample address, so the entered URL was never used.
        string html = Inc.GetHtml(listUrl);

        // Extract link. (The listing also showed an alternative pattern that captures whole
        // anchor elements, "<a[^>]*?>.*?</a>"; only one declaration of `al` can exist, and the
        // clean-up below expects href="..." matches, so the href pattern is the one kept.)
        ArrayList al = Inc.GetMatchesStr(html, @"href\s*=\s*(?:[\'\""\s](?<1>[^\""\']*)[\'""])");

        StringBuilder sb = new StringBuilder();
        foreach (object var in al)
        {
            // Each match still carries the attribute name and the quotes: href="/x.html"
            string a = var.ToString().Replace("\"", "").Replace("'", "");

            // The extraction pattern tolerates whitespace around '=', so the strip must too;
            // otherwise "href = x" would survive and be turned into a bogus URL.
            a = Regex.Replace(a, @"href\s*=\s*", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);

            if (a.StartsWith("/"))
            {
                a = domain + a;
            }

            // Only add a scheme when there is none; https links must not get "http://" prepended.
            bool hasScheme = a.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                          || a.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
            {
                a = "http://" + a;
            }

            sb.Append(a + "\r\n");
        }

        textBox5.Text = sb.ToString(); // Output to a textbox, one row for each link
        MessageBox.Show("Total extract" + al.Count.ToString() + "A link", "Information hint", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
    catch (Exception err)
    {
        MessageBox.Show("Fetch error! Reason: " + err.Message, "Information hint", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
}
// Process the HTML of each collected product page: extract the required fields and save them to a local Access database, and also extract the product image address and automatically download the image to the local images folder
/// <summary>
/// Background job: for every URL listed in textBox5, downloads the product page, cuts the
/// product name, model number and introduction out of the HTML, downloads the product image
/// into the local "images" folder, and finally writes all collected rows to tb_product.
/// </summary>
/// <remarks>
/// Per-URL failures are collected and passed to ShowError at the end instead of aborting
/// the run. The string markers and offsets used for extraction are specific to the target
/// site's HTML and must be adjusted for other sites.
/// NOTE(review): this handler reads textBox5 and writes toolStripStatusLabel1 from the worker
/// thread, and DataBind/ShowError (defined elsewhere) presumably touch the UI as well; such
/// updates normally belong in the ProgressChanged/RunWorkerCompleted handlers - confirm.
/// </remarks>
/// <param name="sender">The BackgroundWorker running this job; used for cancellation checks and progress reports.</param>
/// <param name="e">DoWork event data (not used).</param>
private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
{
    // Fill Product Table: the table is emptied first, so every run replaces the previous one.
    Database.ExecuteNonQuery("Delete from tb_product");
    DataTable dt2 = new DataTable();
    StringBuilder errorStr = new StringBuilder();

    using (OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings))
    using (OleDbDataAdapter da = new OleDbDataAdapter("SELECT * from Tb_product", conn))
    // The command builder is never referenced again, but it registers itself with the
    // adapter and generates the INSERT command that da.Update relies on.
    using (OleDbCommandBuilder cb = new OleDbCommandBuilder(da))
    {
        da.Fill(dt2); // loads the table schema so NewRow() has the right columns
        dt2.Rows.Clear();

        BackgroundWorker worker = (BackgroundWorker)sender; // This is to make a progress bar

        // NOTE(review): ToLower() also lower-cases the path part of each URL, which breaks
        // case-sensitive servers, and Split(',') breaks URLs containing commas - confirm intent.
        string[] urls = textBox5.Text.Trim().ToLower().Replace("\r\n", ",").Split(',');
        string imageDir = AppDomain.CurrentDomain.BaseDirectory + "images\\";

        // Loop over every URL to collect
        for (int i = 0; i < urls.Length; i++)
        {
            // Stop promptly on cancellation; rows collected so far are still saved below.
            if (worker.CancellationPending)
            {
                break;
            }

            // A blank entry (e.g. an empty line) is skipped. Returning here, as the listing
            // did, would leave the method before da.Update and lose every collected row.
            if (urls[i] == "")
            {
                continue;
            }

            try
            {
                string html = Inc.GetHtml(urls[i]); // Get the HTML code for the URL
                DataRow newRow = dt2.NewRow();

                // NOTE(review): none of the IndexOf results below is checked for -1; when a
                // marker is missing the row either fails (logged below) or stores shifted text.

                // Product Name: the text between <title> and </title>
                string productName = html.Substring(html.IndexOf("<title>") + 7);
                productName = productName.Remove(productName.IndexOf("</title>")).Trim();
                newRow["ProductName"] = productName;

                // Product number: whatever follows "Model:" inside the product name
                newRow["ModelID"] = productName.Substring(productName.IndexOf("Model:") + 6).Trim();

                // Product introduction, these are based on the HTML of different sites to make
                // the corresponding changes. The +26 offset is longer than the marker itself
                // (15 chars); presumably it skips the markup that follows the heading - confirm.
                string introduce = html.Substring(html.IndexOf("Product Details") + 26);
                introduce = introduce.Remove(introduce.IndexOf("</table>") + 8).Trim();
                newRow["introduce"] = introduce;

                // Download pictures.
                // NOTE(review): the marker literal on this line was truncated in the published
                // listing (an HTML tag after "align=center>" was stripped); only the visible
                // part is used here - confirm the full marker against the target site's HTML.
                string productImage = html.Substring(html.IndexOf("align=center>"));
                productImage = textBox2.Text.Trim() + productImage.Substring(productImage.IndexOf("src=\"") + 5);
                productImage = productImage.Remove(productImage.IndexOf("\""));

                // Local file name = last path segment of the image URL.
                string imageFile = imageDir + productImage.Substring(productImage.LastIndexOf("/") + 1);
                try
                {
                    Inc.DownFile(productImage, imageFile);
                }
                catch (Exception)
                {
                    // A failed image download is logged but does not discard the product row.
                    errorStr.Append("Download picture failed, picture address:" + imageFile + "\r\n");
                }

                dt2.Rows.Add(newRow);
                Thread.Sleep(100); // short pause between requests
                worker.ReportProgress((i + 1) * 100 / urls.Length, i);
                toolStripStatusLabel1.Text = "Process progress:" + (i + 1).ToString() + "/" + urls.Length.ToString(); // progress bar
            }
            catch (Exception err)
            {
                errorStr.Append("Collection error:" + err.Message + "; URL:" + urls[i] + "\r\n");
            }
        }

        da.Update(dt2);
    }

    DataBind(dt2);
    ShowError(errorStr.ToString());
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body as a string.
/// </summary>
/// <remarks>
/// Originally described as "ASPX page generates a static HTML page". Author: Zheng.
/// The response, its stream and the reader are disposed even when reading fails.
/// </remarks>
/// <param name="url">Address of the page to fetch; must be an HTTP(S) URL because the response is cast to HttpWebResponse.</param>
/// <returns>The response body, decoded with the charset the server declared, or UTF-8 when that charset is missing or unknown.</returns>
public static string GetHtml(string url)
{
    // Read remote path
    WebRequest request = WebRequest.Create(url);
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream body = response.GetResponseStream())
    using (StreamReader sr = new StreamReader(body, ResolveResponseEncoding(response.CharacterSet)))
    {
        return sr.ReadToEnd();
    }
}

/// <summary>
/// Maps a response charset name to an Encoding, falling back to UTF-8 instead of throwing
/// when the name is empty or not recognised.
/// </summary>
private static Encoding ResolveResponseEncoding(string charset)
{
    if (string.IsNullOrEmpty(charset))
    {
        return Encoding.UTF8;
    }

    try
    {
        return Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        // Encoding.GetEncoding rejects names it does not know; a best-effort decode is
        // more useful to the caller than failing the whole download.
        return Encoding.UTF8;
    }
}
// Extract URLs from the HTML code
/// <summary>
/// Returns every distinct substring of <paramref name="htmlCode"/> that matches
/// <paramref name="strRegex"/>, sorted in ascending order.
/// </summary>
/// <param name="htmlCode">Text to search.</param>
/// <param name="strRegex">Regular expression; applied with IgnoreCase and Multiline.</param>
/// <returns>
/// An ArrayList of strings. Matching ignores case, but duplicates are detected by exact
/// comparison, so two matches that differ only in case are both kept.
/// </returns>
public static ArrayList GetMatchesStr(string htmlCode, string strRegex)
{
    ArrayList al = new ArrayList();
    // Filter for duplicate URLs: a hash set gives constant-time membership checks, where
    // re-scanning the result list for every match was quadratic on link-heavy pages.
    System.Collections.Generic.HashSet<string> seen =
        new System.Collections.Generic.HashSet<string>(StringComparer.Ordinal);

    Regex r = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    foreach (Match m in r.Matches(htmlCode))
    {
        string strNew = m.Value;
        if (seen.Add(strNew))
        {
            al.Add(strNew);
        }
    }

    al.Sort();
    return al;
}
/// <summary>
/// Downloads the resource at <paramref name="Url"/> and saves it to the local file
/// <paramref name="Path"/>, replacing any existing file.
/// </summary>
/// <param name="Url">HTTP(S) address of the file to download.</param>
/// <param name="Path">Destination file path; its directory must already exist.</param>
public static void DownFile(string Url, string Path)
{
    const int BufferSize = 1024;

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    // Create a file Stream object. FileMode.Create truncates an existing file; with
    // OpenOrCreate a shorter download would leave the tail of the old file in place.
    using (FileStream fs = new FileStream(Path, FileMode.Create, FileAccess.Write))
    {
        byte[] buffer = new byte[BufferSize];
        int n;
        // Read returns 0 at end of stream; write only the bytes actually read.
        while ((n = stream.Read(buffer, 0, buffer.Length)) > 0)
        {
            fs.Write(buffer, 0, n);
        }
    }
}