Use a regular expression to match the href and src attribute values found in crawled pages.
// User-Agent header value sent with every upstream request.
string UserAgent = "mozilla/5.0 (Windows NT 5.2; rv:29.0) gecko/20100101 firefox/29.0";

// Content-Type header of the most recent successful upstream response
// (stays empty until a download succeeds).
string ContentType = "";

// Base address of the upstream site; each incoming raw URL is resolved against it.
Uri Strrequrl = new Uri("http://m.lhrb.ufstone.net/");

/// <summary>
/// Handles every incoming request by fetching the same path from the upstream
/// site (<see cref="Strrequrl"/>), decoding the body as gb2312, running it
/// through <c>Gethtmlurl</c>, and writing the resulting markup to the response.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
protected void Application_BeginRequest(Object sender, EventArgs e)
{
    Uri U = new Uri(Strrequrl, Request.RawUrl);
    byte[] b = getverificationcode(U);

    if (b == null)
    {
        // The upstream download failed. Previously the null buffer was passed
        // straight to Encoding.GetString, which throws ArgumentNullException.
        // Report the failure as a gateway error instead.
        Response.StatusCode = 502;
        Response.End();
        return;
    }

    // Disabled alternative: stream the upstream bytes back unchanged.
    //MemoryStream ms = new MemoryStream(b);
    //Response.ClearContent();
    //Response.ContentType = ContentType;
    //Response.BinaryWrite(b);

    StringBuilder strhtml = new StringBuilder(Encoding.GetEncoding("gb2312").GetString(b));
    Gethtmlurl(ref strhtml);
    Response.Write(strhtml.ToString());
    Response.End();
}

/// <summary>
/// Downloads the raw bytes at <paramref name="URL"/> with browser-like request
/// headers and records the response's Content-type header in
/// <see cref="ContentType"/>.
/// </summary>
/// <param name="URL">Absolute address of the resource to download.</param>
/// <returns>The response body, or <c>null</c> when the download fails.</returns>
public byte[] getverificationcode(Uri URL)
{
    // WebClient is IDisposable; dispose it deterministically instead of leaking it.
    using (WebClient mywebclient = new WebClient())
    {
        mywebclient.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        mywebclient.Headers.Add("Accept-language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
        mywebclient.Headers.Add("user-agent", this.UserAgent);
        // NOTE(review): DefaultCredentials offers the process's own credentials
        // if the upstream challenges for them - confirm this is intended for an
        // external host.
        mywebclient.Credentials = CredentialCache.DefaultCredentials;

        try
        {
            byte[] pagedata = mywebclient.DownloadData(URL.AbsoluteUri);
            ContentType = mywebclient.ResponseHeaders["Content-type"];
            return pagedata;
        }
        catch (WebException)
        {
            // Network or HTTP failure: keep the "null means failed" contract the
            // caller checks for, but no longer swallow unrelated exception types.
            return null;
        }
    }
}
/// <summary>
/// Scans the page markup for <c>src</c> and <c>href</c> attributes and collects
/// every match, one per line, into a local buffer.
/// </summary>
/// <remarks>
/// The collected matches are currently discarded and <paramref name="strhtml"/>
/// is left unmodified: the step that rewrites each matched URL relative to the
/// upstream base address is still disabled (see the commented block in the loop).
/// </remarks>
/// <param name="strhtml">Page markup to scan; a null reference is ignored.</param>
void Gethtmlurl(ref StringBuilder strhtml)
{
    if (strhtml == null)
    {
        return;
    }

    // Earlier look-around based attempt, kept for reference:
    //string headstr = "(src|href)=", endstr = "(\")";
    //string reg = @"(?<=" + headstr + ")(.*?)(?=" + endstr + ")";

    // Attribute name, optional whitespace around '=', then either a
    // double-quoted value or an unquoted run of non-whitespace characters.
    // Group 1 holds the value in both alternatives.
    string Reg = "(src|href)\\s*=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))";
    // HTML attribute names are case-insensitive, so match SRC/HREF as well.
    Regex R = new Regex(Reg, RegexOptions.IgnoreCase);

    StringBuilder SB = new StringBuilder();
    for (Match match = R.Match(strhtml.ToString()); match.Success; match = match.NextMatch())
    {
        // Record the whole matched attribute, e.g. href="/index.html".
        SB.Append(match + "\n");

        // Disabled: rewrite each matched URL as a path relative to the upstream base.
        //try
        //{
        //    Uri u = new Uri(Strrequrl, match.Value.Replace("\"", "").Replace("'", ""));
        //    strhtml.Replace(match.Value, @"/" + u.ToString().Replace(Strrequrl.ToString(), ""));
        //}
        //catch
        //{
        //}
    }
}