Class for capturing and analyzing Web pages.
Main functions:
1. Extract the plain text of the web page, stripping all HTML tags and JavaScript code.
2. Extract links to webpages, including href, frame, and iframe
3. Extract the title of a webpage (other tags can be extracted in the same way; the regular expression is analogous)
4. Simple Form submission and cookie Storage
/*
* Author: Sunjoy at CCNU
* If you have improved this class, please send me a copy of the code (ccnusjy at gmail.com)
*/
UsingSystem;
UsingSystem. Data;
UsingSystem. Configuration;
UsingSystem. Net;
UsingSystem. IO;
UsingSystem. Text;
UsingSystem. Collections. Generic;
UsingSystem. Text. RegularExpressions;
UsingSystem. Threading;
UsingSystem. Web;
/// <Summary>
/// Webpage
/// </Summary>
Public ClassWebPage
{
#region Private members
private Uri m_uri;          // URL of this webpage
private List<Link> m_links; // Links found on this webpage
private string m_title;     // Title of the webpage
private string m_html;      // HTML code of the webpage
private string m_outstr;    // Plain text that can be output from this webpage
private bool m_good;        // Whether the webpage is available
private int m_pagesize;     // Size of the webpage
// Cookie store for all webpages; static, so it is shared by every WebPage instance.
private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();
private string m_post;      // POST data required for the login page of this webpage
private string m_loginurl;  // Login page of this webpage
#endregion
# Region private Method
/// <summary>
/// Extracts the links contained in this page's HTML: anchor <c>href</c> targets
/// (together with their link text) and <c>frame</c>/<c>iframe</c> <c>src</c> targets.
/// The HTML is scanned only while <c>m_links</c> is empty; otherwise the
/// previously collected list is returned as-is.
/// </summary>
/// <returns>The list of links found on the page (the <c>m_links</c> instance).</returns>
private List<Link> getLinks()
{
    // Links were already collected by an earlier call: reuse them.
    if (m_links.Count != 0)
    {
        return m_links;
    }

    Regex[] regex = new Regex[2];
    // [0] anchors: captures the href target as "url" and the element content as "text".
    regex[0] = new Regex("(?m)<a[^><]+href=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>(\\w|\\W)*?)</", RegexOptions.Multiline | RegexOptions.IgnoreCase);
    // [1] frame / iframe: captures the src target as "url" (no link text).
    regex[1] = new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    // Removes nested tags, whitespace, "&nbsp;", '&' and '"' from anchor text.
    // Built once here because it does not change between matches.
    Regex textCleaner = new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    for (int i = 0; i < regex.Length; i++)
    {
        Match match = regex[i].Match(m_html);
        while (match.Success)
        {
            try
            {
                // Resolve the (possibly relative) target against this page's URL.
                string url = new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri;
                string text = "";
                if (i == 0)
                {
                    // Only the anchor pattern defines a "text" group.
                    text = textCleaner.Replace(match.Groups["text"].Value, "");
                }
                m_links.Add(new Link(url, text));
            }
            catch (Exception ex)
            {
                // Best effort: a target that cannot be turned into a link is
                // reported and skipped so the remaining links are still collected.
                Console.WriteLine(ex.Message);
            }
            match = match.NextMatch();
        }
    }

    return m_links;
}
/// <Summary>
/// This private method extracts a certain number of words of plain text from a piece of HTML text
/// </Summary>
/// <Param name = "instr"> HTML code </param>
/// <Param name = "firstN"> Number of words extracted from the header </param>
/// <Param name = "withLink"> whether to link the text </param>
/// <Returns> plain text </re