C# Web Page Crawling and Analysis

Source: Internet
Author: User

Class for capturing and analyzing Web pages.

Main functions:

1. Extract the plain text of the web page, stripping all HTML tags and JavaScript code.

2. Extract links to webpages, including href, frame, and iframe

3. Extract the title of a web page (other tags can be extracted in the same way, since the regular expression is analogous)

4. Simple Form submission and cookie Storage

/*
* Author: Sunjoy at CCNU
* If you improve this class, please send me a copy of the code (ccnusjy at gmail.com)
*/


UsingSystem;
UsingSystem. Data;
UsingSystem. Configuration;
UsingSystem. Net;
UsingSystem. IO;
UsingSystem. Text;
UsingSystem. Collections. Generic;
UsingSystem. Text. RegularExpressions;
UsingSystem. Threading;
UsingSystem. Web;
/// <Summary>
/// Webpage
/// </Summary>
Public ClassWebPage
{

#region private member
// Absolute URL of this web page; used as the base when resolving relative links.
private Uri m_uri;
// Links found on this web page (filled on demand by getLinks()).
private List<Link> m_links;
// Title of the web page.
private string m_title;
// HTML code of the web page.
private string m_html;
// Plain text that can be output from this web page.
private string m_outstr;
// Whether the web page is available.
private bool m_good;
// Size of the web page.
private int m_pagesize;
// Cookies of all web pages, shared by every WebPage instance (static).
private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();
// POST data required for the login page of this web page.
private string m_post;
// Login page of this web page.
private string m_loginurl;
#endregion


# Region private Method
// Matches <a ... href=...> elements. The attribute value may be wrapped in double quotes,
// single quotes, or left unquoted. Group "url" is the link target; group "text" is the
// (lazily matched) content up to the next "</".
private static readonly Regex s_anchorLinkRegex = new Regex(
    "(?m)<a[^><]+href=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>(\\w|\\W)*?)</",
    RegexOptions.Multiline | RegexOptions.IgnoreCase);

// Matches <frame ... src=...> and <iframe ... src=...> elements. Group "url" is the source URL.
private static readonly Regex s_frameLinkRegex = new Regex(
    "<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>",
    RegexOptions.Multiline | RegexOptions.IgnoreCase);

// Removes nested tags, whitespace, "&nbsp;", stray '&' and '"' characters from anchor text.
private static readonly Regex s_linkTextCleanupRegex = new Regex(
    "(<[^>]+>)|(\\s)|(&nbsp;)|&|\"",
    RegexOptions.Multiline | RegexOptions.IgnoreCase);

/// <summary>
/// Analyzes the HTML code of the web page (<c>m_html</c>) and collects the links it contains:
/// anchor <c>href</c> targets and <c>frame</c>/<c>iframe</c> <c>src</c> targets.
/// </summary>
/// <remarks>
/// The result is stored in <c>m_links</c>, and the HTML is only scanned while that list is empty.
/// A page in which no link is found is therefore scanned again on every call.
/// Each URL is resolved against <c>m_uri</c> and stored in absolute form. Link text is only
/// captured for anchors; frame links get an empty text.
/// </remarks>
/// <returns>The list of links held in <c>m_links</c>.</returns>
private List<Link> getLinks()
{
    if (m_links.Count == 0)
    {
        // Index 0 is the anchor pattern, the only one that defines a "text" group.
        Regex[] linkPatterns = { s_anchorLinkRegex, s_frameLinkRegex };
        for (int i = 0; i < linkPatterns.Length; i++)
        {
            for (Match match = linkPatterns[i].Match(m_html); match.Success; match = match.NextMatch())
            {
                try
                {
                    // Resolve relative targets against the page's own URL.
                    string url = new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri;
                    string text = "";
                    if (i == 0)
                    {
                        text = s_linkTextCleanupRegex.Replace(match.Groups["text"].Value, "");
                    }
                    m_links.Add(new Link(url, text));
                }
                catch (Exception ex)
                {
                    // Best effort: one malformed link must not abort the scan of the
                    // remaining matches, so report it and continue.
                    Console.WriteLine(ex.Message);
                }
            }
        }
    }
    return m_links;
}

/// <Summary>
/// This private method extracts a certain number of words of plain text from a piece of HTML text
/// </Summary>
/// <Param name = "instr"> HTML code </param>
/// <Param name = "firstN"> Number of words extracted from the header </param>
/// <Param name = "withLink"> whether to link the text </param>
/// <Returns> plain text </re

Related Article

Contact Us

The content of this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.