I am here.
First, mshtml is very useful for parsing HTML elements. For example:
Using mshtml;
/// <summary>
/// Loads an HTML fragment into an MSHTML document and rewrites either every
/// anchor <c>href</c> or every image <c>src</c> so that it is an absolute URL.
/// </summary>
/// <param name="html">The HTML markup to process.</param>
/// <param name="relativeLocation">Base URI that relative links are resolved against.</param>
/// <param name="_htmlTag">
/// <c>HtmlTag.Link</c> rewrites anchors; any other value rewrites images.
/// NOTE(review): <c>HtmlTag</c> is declared elsewhere; its casing is restored by assumption.
/// </param>
/// <returns>The inner HTML of the document body after the rewrite.</returns>
/// <exception cref="ArgumentNullException"><paramref name="relativeLocation"/> is null.</exception>
private string ConvertToAbsoluteUrls(string html, Uri relativeLocation, HtmlTag _htmlTag)
{
    if (relativeLocation == null)
    {
        throw new ArgumentNullException("relativeLocation");
    }

    IHTMLDocument2 doc = new HTMLDocumentClass();
    doc.write(new object[] { html });
    doc.close();

    if (_htmlTag == HtmlTag.Link)
    {
        foreach (IHTMLAnchorElement anchor in doc.links)
        {
            IHTMLElement element = (IHTMLElement)anchor;
            // Flag 2 asks MSHTML for the attribute text as written in the markup,
            // not the value it has already resolved itself.
            string href = element.getAttribute("href", 2) as string;
            Uri address;
            // A malformed link is left untouched instead of aborting the whole rewrite.
            if (href != null && Uri.TryCreate(relativeLocation, href, out address))
            {
                anchor.href = address.AbsoluteUri;
            }
        }
    }
    else
    {
        foreach (IHTMLImgElement image in doc.images)
        {
            IHTMLElement element = (IHTMLElement)image;
            string src = element.getAttribute("src", 2) as string;
            Uri address;
            if (src != null && Uri.TryCreate(relativeLocation, src, out address))
            {
                image.src = address.AbsoluteUri;
            }
        }
    }

    return doc.body.innerHTML;
}
-----------------------------------------
In fact, there are many programs of this kind, and there are also many things to watch out for.
1. Pay attention to the content format. Some content is formatted, with line breaks, indentation and so on. For such content you generally need to capture the formatting tags (for example `<br>` and `<p>`) and other HTML elements together with the text and save them to the database.
2. Download the images to your local server, and verify the image format with regular expressions.
Below is my own class for capturing online news.
C# code
/// <summary>
/// Describes one remote resource found in the content: where it came from,
/// how its file name splits into name and extension, and its new local name.
/// </summary>
struct ResInfo
{
    /// <summary>
    /// Complete original path (URL) exactly as it appears in the content.
    /// </summary>
    public string orgurl;

    /// <summary>
    /// Original file name, without the extension.
    /// </summary>
    public string orgname;

    /// <summary>
    /// Original file extension, without the leading dot.
    /// </summary>
    public string extname;

    /// <summary>
    /// New file name.
    /// </summary>
    public string newname;
}
/// <summary>
/// Obtains the remote resources (image files by default) referenced in a piece of
/// (web page) content, saves them locally, and rewrites the content so that it
/// points at the local copies.
/// </summary>
public class RemoteResource
{
    // Running counter used to build generated file names ("001", "002", ...).
    private int seriesNum;

    // Prefix for generated file names. The random-number initialisation of the
    // original code is disabled (see constructor), so the prefix is empty.
    private string fileNum = "";

    // '|'-separated list of dotted extensions; spliced into the search regex.
    private string restype = ".gif|.jpg|.bmp|.png|.jpeg";

    private string _remoteurl;
    private string _localurl;
    private string _localpath;
    private string _content = "";
    private bool _rename;

    // True when no remote base URL was supplied: only fully-qualified URLs are collected.
    private bool bcomp = false;

    /// <summary>
    /// Constructor.
    /// </summary>
    /// <param name="content">The content from which remote resources are to be obtained.</param>
    /// <param name="localUrlDirectory">
    /// Virtual directory of the local server that the files are saved to; it replaces the
    /// original remote link address, for example http://www.com2000888.com/remoteres.
    /// It can be empty, or a relative path such as ./a.
    /// </param>
    /// <param name="localPhysicalDirectory">
    /// Disk path on the local server that the files are saved to, for example
    /// C:\Inetpub\wwwroot\remoteres. The directory is created if it does not exist.
    /// </param>
    /// <param name="remoteUrl">
    /// Base address used to process relative paths (such as src="../images/com2000888.gif").
    /// If this is null or empty, only resources given with a full path
    /// (starting with http://, https://, ftp://, rtsp:// or mms://) are obtained.
    /// </param>
    /// <param name="renameFile">
    /// Whether to rename the resource files. If false, the original file name is kept and
    /// an existing file with the same name is overwritten.
    /// </param>
    /// <exception cref="NullReferenceException">The local physical path is null or blank.</exception>
    public RemoteResource(string content, string localUrlDirectory, string localPhysicalDirectory, string remoteUrl, bool renameFile)
    {
        _content = content ?? "";
        _localurl = localUrlDirectory == null ? "" : localUrlDirectory.Trim();
        _localpath = localPhysicalDirectory == null ? "" : localPhysicalDirectory.Trim();
        _remoteurl = remoteUrl == null ? "" : remoteUrl.Trim();
        bcomp = _remoteurl.Length == 0;

        if (_localpath.Length == 0)
        {
            // Exception type kept as in the original so existing callers' catch blocks still work.
            throw new NullReferenceException("The local physical path cannot be blank!");
        }

        _rename = renameFile;
        seriesNum = 1;
        // fileNum = com2000888.Common.Rand.Number(6);  (disabled in the original source)

        // Normalise separators and drop trailing ones so paths can be joined uniformly.
        _localpath = _localpath.Replace("/", "\\").TrimEnd('\\');
        _localurl = _localurl.Replace("\\", "/").TrimEnd('/');
        _remoteurl = _remoteurl.Replace("\\", "/");

        if (!Directory.Exists(_localpath))
        {
            Directory.CreateDirectory(_localpath);
        }
    }

    /// <summary>
    /// The extensions of the resource files to be obtained, for example
    /// {"gif", "jpg", "png"}. A leading dot is optional and is normalised away.
    /// The default set is gif, jpg, bmp, png and jpeg.
    /// </summary>
    /// <exception cref="ArgumentNullException">The assigned array is null.</exception>
    public string[] FileExtends
    {
        set
        {
            if (value == null)
            {
                throw new ArgumentNullException("value");
            }

            string[] dotted = new string[value.Length];
            for (int i = 0; i < value.Length; i++)
            {
                dotted[i] = "." + value[i].TrimStart('.');
            }
            restype = string.Join("|", dotted);
        }
    }

    /// <summary>
    /// Scans the content and returns one entry per distinct resource URL found.
    /// </summary>
    private IList<ResInfo> ObtainResUrl()
    {
        IList<ResInfo> list = new List<ResInfo>();
        string extensions = restype.Replace(".", "\\.");

        // Without a remote base URL only absolute URLs are usable; otherwise take
        // whatever a src attribute points at. The character class keeps a match
        // inside a single attribute value (it cannot run across quotes or tags).
        string pattern = bcomp
            ? @"(http|https|ftp|rtsp|mms)://\S+(" + extensions + ")"
            : "src\\s?=\\s?['\"]?(?<resurl>[^'\"<>]+?(" + extensions + "))";

        Regex reg = new Regex(pattern, RegexOptions.IgnoreCase);
        for (Match m = reg.Match(_content); m.Success; m = m.NextMatch())
        {
            string url = bcomp ? m.Value : m.Groups["resurl"].Value;
            if (ContainsUrl(list, url))
            {
                continue;
            }

            // Split "path/name.ext" into name and extension.
            string normalized = url.Replace("\\", "/").Trim();
            int slash = normalized.LastIndexOf('/');
            string name = slash >= 0 ? normalized.Substring(slash + 1) : normalized;
            int dot = name.LastIndexOf('.');
            if (dot < 0)
            {
                // Nothing to split; skip rather than fail on Substring.
                continue;
            }

            ResInfo r;
            r.orgurl = url;
            r.orgname = name.Substring(0, dot);
            r.extname = name.Substring(dot + 1);
            r.newname = "";
            list.Add(r);
        }
        return list;
    }

    /// <summary>
    /// Returns true when <paramref name="list"/> already holds an entry for <paramref name="url"/>.
    /// </summary>
    private static bool ContainsUrl(IList<ResInfo> list, string url)
    {
        foreach (ResInfo res in list)
        {
            if (res.orgurl.Equals(url))
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Builds the next generated file name for <paramref name="extension"/> that does not
    /// yet exist in the local directory, advancing the series counter past names in use.
    /// </summary>
    private string NextFreeFileName(string extension)
    {
        string candidate = fileNum + seriesNum.ToString().PadLeft(3, '0') + "." + extension;
        while (File.Exists(_localpath + "\\" + candidate))
        {
            seriesNum++;
            candidate = fileNum + seriesNum.ToString().PadLeft(3, '0') + "." + extension;
        }
        return candidate;
    }

    /// <summary>
    /// Saves the remote resources locally and replaces their addresses in the content.
    /// A resource that cannot be downloaded is skipped and its link is left unchanged.
    /// </summary>
    public void FetchResource()
    {
        IList<ResInfo> list = ObtainResUrl();

        // Computed locally so that repeated calls do not keep appending "/" to the field.
        string urlPrefix = _localurl.Length == 0 ? "" : _localurl + "/";

        using (WebClient wb = new WebClient())
        {
            foreach (ResInfo r in list)
            {
                try
                {
                    // NOTE(review): UtilityPage.StickUrl is declared elsewhere; presumably it
                    // combines the remote base URL with a relative resource path. Confirm.
                    string url = UtilityPage.StickUrl(_remoteurl, r.orgurl);

                    string fileName = _rename
                        ? NextFreeFileName(r.extname)
                        : r.orgname + "." + r.extname;

                    wb.DownloadFile(url, _localpath + "\\" + fileName);

                    // Point the content at the local copy.
                    _content = _content.Replace(r.orgurl, urlPrefix + fileName);
                    seriesNum++;
                }
                catch (Exception ex)
                {
                    // Best effort by design: one failed download must not stop the rest,
                    // but the failure is recorded instead of being silently discarded.
                    System.Diagnostics.Trace.TraceWarning(
                        "RemoteResource: failed to fetch '{0}': {1}", r.orgurl, ex.Message);
                }
            }
        }
    }

    /// <summary>
    /// The content, with resource addresses replaced once <see cref="FetchResource"/> has run.
    /// </summary>
    public string Content
    {
        get { return _content; }
    }
}
Convert relative links in HTML to absolute links.
C# code
// Matches a tag whose src/value/href attribute points at a file with one of the
// listed extensions; the URL itself is captured in the named group "url".
private static readonly Regex ResourceTagPattern = new Regex(
    @"<\S[^>]*(\ssrc=|\svalue=|\shref=)('|"")?(?<url>[^>""'\s]*\.(gif|jpg|bmp|jpeg|psd|png|svg|dxf|wmf|tiff|swf))('|"")?[^>]*>",
    RegexOptions.IgnoreCase);

/// <summary>
/// Rewrites relative resource links (images, Flash, etc.) found in src, value or
/// href attributes so that they become absolute URLs.
/// </summary>
/// <param name="bsUrl">Base URL that relative links are resolved against.</param>
/// <param name="strHtml">The HTML to process.</param>
/// <returns>
/// The HTML with matching links made absolute. Null or empty input is returned unchanged,
/// and a link that cannot be resolved is left as it was.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="bsUrl"/> is null.</exception>
public static string FormatUrlInHtml_New(Uri bsUrl, string strHtml)
{
    if (bsUrl == null)
    {
        throw new ArgumentNullException("bsUrl");
    }
    if (string.IsNullOrEmpty(strHtml))
    {
        return strHtml;
    }

    return ResourceTagPattern.Replace(strHtml, delegate(Match match)
    {
        Group link = match.Groups["url"];
        Uri address;
        if (!Uri.TryCreate(bsUrl, link.Value, out address))
        {
            return match.Value;
        }

        // Replace only the captured URL span, so identical text elsewhere in the
        // tag (for example inside an alt attribute) is not touched.
        int offset = link.Index - match.Index;
        return match.Value.Substring(0, offset)
            + address.AbsoluteUri
            + match.Value.Substring(offset + link.Length);
    });
}
Here I also provide a class library dedicated to page parsing: Winista.HtmlParser.dll
http://topic.csdn.net/u/20100504/15/379579b9-7cf0-4400-8fdb-995a644f7917.html?26912