// Download the page to crawl into htmlstr, retrying on failure.
// htmlstr stays null when every attempt fails.
const string URL = "Http://www.hn3ddf.gov.cn/price/GetList.html?pageno=1";
string htmlstr = null;

// NOTE(review): the loop bound was lost in the original text (`i <;`).
// Three attempts is an assumption — confirm against the intended retry policy.
const int maxAttempts = 3;
for (int i = 0; i < maxAttempts; i++)
{
    try
    {
        System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(URL);
        request.Headers.Set("Pragma", "No-cache");
        // Each retry gets a longer timeout: 10 s, then 15 s, then 20 s, ...
        request.Timeout = 10000 + (i * 5000);

        // Dispose the response, stream and reader deterministically so that
        // failed or repeated attempts do not leak connections.
        using (System.Net.HttpWebResponse response = (System.Net.HttpWebResponse)request.GetResponse())
        using (System.IO.Stream streamReceive = response.GetResponseStream())
        using (System.IO.StreamReader streamReader = new System.IO.StreamReader(streamReceive, System.Text.Encoding.GetEncoding("Utf-8")))
        {
            htmlstr = streamReader.ReadToEnd();
        }

        // Success: stop retrying.
        break;
    }
    catch (Exception e)
    {
        // Best effort: record the failure and fall through to the next attempt
        // instead of silently discarding the exception.
        Common.Log4netUtil.Log().Error("Crawl exception on attempt " + (i + 1) + ": " + e.Message);
    }
}
// Extract the price rows from the crawled page and render them as HTML table rows.
//
// Each row in the source page is a <ul> holding three <li> cells, e.g.:
//
//   <ul style="font-size:12px;width:320px; margin:0; padding:0;">
//     <li style="color:#555555; float:left; display:block; width:140px; height:22px; line-height:22px;" align="center">Layer Feed</li>
//     <li align="center" style="color:#555555; float:left; display:block; width:100px; height:22px; line-height:22px;">2.83 RMB/kg</li>
//     <li style="color:#555555; float:left; display:block; width:50px;text-align:center; height:22px; line-height:22px;">05-21</li>
//   </ul>
//
// Earlier approaches (splitting on "</li>", fixed-offset Substring calls, and the
// original <td>/<font>-based patterns) were dead code and have been removed.

// Nothing was downloaded (every fetch attempt failed): return an empty result
// rather than letting Regex.Matches throw on a null input.
if (string.IsNullOrEmpty(htmlstr))
{
    return string.Empty;
}

// NOTE(review): the exact whitespace of the original patterns could not be recovered,
// so the patterns below tolerate optional whitespace. The <ul> is still identified by
// its style declaration; each <li> is identified by its width (140px = name,
// 100px = price, 50px = date), which also makes attribute order irrelevant.
MatchCollection pricelist = Regex.Matches(
    htmlstr,
    @"<ul\s+style\s*=\s*""\s*font-size:\s*12px;\s*width:\s*320px;\s*margin:\s*0;\s*padding:\s*0;\s*""\s*>(.*?)</ul>",
    RegexOptions.IgnoreCase | RegexOptions.Singleline);

StringBuilder resultstr = new StringBuilder();
for (int i = 0; i < pricelist.Count; i++)
{
    try
    {
        string priceitem = pricelist[i].Value;

        // Groups[0] is the whole match (tag, attributes and text); Groups[1] is the
        // first capture group, i.e. only the text between <li ...> and </li>.
        Match titlematch = Regex.Match(priceitem, @"<li\b[^>]*width:\s*140px[^>]*>([^<]*)</li>", RegexOptions.IgnoreCase);
        string name = titlematch.Groups[1].Value;

        Match pricematch = Regex.Match(priceitem, @"<li\b[^>]*width:\s*100px[^>]*>([^<]*)</li>", RegexOptions.IgnoreCase);
        string price = pricematch.Groups[1].Value;

        Match weeksmatch = Regex.Match(priceitem, @"<li\b[^>]*width:\s*50px[^>]*>([^<]*)</li>", RegexOptions.IgnoreCase);
        string weeks = weeksmatch.Groups[1].Value;

        // Build the complete row first and append it in one step, so a failure
        // while parsing can never leave an unbalanced <tr> in the output.
        string row = "<tr>"
            + "<td width=\"195\" height=\"25\" align=\"left\">" + name + "</td>"
            + "<td width=\"70\" height=\"25\" align=\"center\" style=\"text-align:right;\">" + price + "</td>"
            + "<td height=\"25\" align=\"center\" style=\"color:#55a8ea;\">" + weeks + "</td>"
            + "</tr>";
        resultstr.Append(row);
    }
    catch (Exception ex)
    {
        // Skip the offending row but keep processing the remaining ones.
        Common.Log4netUtil.Log().Error("Get cross-domain data errors." + ex.Message);
    }
}
return resultstr.ToString();