// Several methods for extracting hyperlink addresses using regular expressions in C#
//
// Crawlers and CMS tools frequently need to extract the href link of an <a> tag or the
// src address of an <img> tag from an HTML string. Regular expressions make this easy;
// four variants are shown below.

using System;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Regex-based helpers for pulling link (href) and image (src) addresses out of HTML text.
/// </summary>
public static class HtmlLinkExtraction
{
    // The patterns are built once and reused; constructing a Regex (especially a
    // Compiled one) on every call is needlessly expensive.

    // Group 1 is the optional opening quote; \1 requires the same closing quote.
    // (?is) = ignore case + let '.' match newlines so link text may span lines.
    private static readonly Regex AnchorRegex = new Regex(
        @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

    // Group 1 is either the content of a double-quoted value or, for an unquoted
    // value, a run of non-whitespace characters (\S+, not \s+).
    private static readonly Regex HrefRegex = new Regex(
        "href\\s*=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex ImgSrcRegex = new Regex(
        @"(?i)<img[^>]*?\ssrc\s*=\s*(['""]?)(?<src>[^'""\s>]+)\1[^>]*>");

    // \s already covers \t, \r and \n, so the character classes need nothing more.
    private static readonly Regex ImgTagRegex = new Regex(
        @"<img\b[^<>]*?\bsrc\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Method 1: extracts the href value and the inner content of every &lt;a&gt; element.
    /// </summary>
    /// <param name="yourStr">HTML text to scan.</param>
    /// <returns>
    /// For each match, the href value followed by "\n" and the content between
    /// &lt;a&gt; and &lt;/a&gt; followed by "\n". Typical use in a WinForms application:
    /// <c>richTextBox2.Text += HtmlLinkExtraction.ExtractLinksAndText(html);</c>
    /// </returns>
    public static string ExtractLinksAndText(string yourStr)
    {
        StringBuilder output = new StringBuilder();
        MatchCollection mc = AnchorRegex.Matches(yourStr);
        foreach (Match m in mc)
        {
            output.Append(m.Groups["url"].Value).Append("\n");  // the href value
            output.Append(m.Groups["text"].Value).Append("\n"); // the content between <a> and </a>
        }
        return output.ToString();
    }

    /// <summary>
    /// Method 2: writes every href value found in the input, together with the
    /// position at which the value starts, to the console.
    /// </summary>
    /// <param name="inputString">Text to scan.</param>
    public static void PrintHrefs(string inputString)
    {
        Match m;
        for (m = HrefRegex.Match(inputString); m.Success; m = m.NextMatch())
        {
            Console.WriteLine("Found href " + m.Groups[1] + " at " + m.Groups[1].Index);
        }
    }

    /// <summary>
    /// Method 3: writes the src value of every &lt;img&gt; tag to the console, one per line.
    /// </summary>
    /// <param name="yourStr">HTML text to scan.</param>
    public static void PrintImageSources(string yourStr)
    {
        MatchCollection mc = ImgSrcRegex.Matches(yourStr);
        foreach (Match m in mc)
        {
            Console.Write(m.Groups["src"].Value + "\n");
        }
    }

    /// <summary>
    /// Method 4: gets the paths of the images referenced by the HTML text.
    /// </summary>
    /// <param name="htmlText">HTML string.</param>
    /// <returns>
    /// The src value of each matched &lt;img&gt; tag, in document order;
    /// an empty array when nothing matches.
    /// </returns>
    public static string[] GetHtmlImageUrlList(string htmlText)
    {
        // One match per <img> tag that carries a src attribute.
        MatchCollection matches = ImgTagRegex.Matches(htmlText);

        string[] urlList = new string[matches.Count];
        int i = 0;

        // Copy the captured src of every tag into the result array.
        foreach (Match match in matches)
        {
            urlList[i++] = match.Groups["imgUrl"].Value;
        }
        return urlList;
    }
}