Note: images referenced from CSS are not collected, and the regular expression used to find images still needs improvement.
The code is as follows:
using System;
using System.Configuration;
using System.Data;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
// Namespaces required by the image collector
using System.Collections;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
/// <summary>
/// Image collector: downloads the images referenced by the <c>src</c> attributes
/// of a web page and saves them to local storage.
/// </summary>
public class caiji
{
    /// <summary>
    /// Matches a quoted <c>src</c> attribute whose value ends in a supported image
    /// extension. Whitespace around <c>=</c> is allowed, the closing quote must be the
    /// same character as the opening one, and the value may not contain quotes, angle
    /// brackets or whitespace, so a match can never run across several attributes.
    /// Group <c>url</c> is the raw attribute value; group <c>ext</c> is the extension.
    /// </summary>
    private static readonly Regex ImageSrcPattern = new Regex(
        @"src\s*=\s*(?<q>[""'])(?<url>[^""'<>\s]+?\.(?<ext>gif|jpe?g|png|bmp))\k<q>",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Creates a collector instance. All functionality is exposed through static
    /// methods; the constructor is kept so existing <c>new caiji()</c> callers compile.
    /// </summary>
    public caiji()
    {
    }

    /// <summary>
    /// Fetches the page at <paramref name="URL"/>, finds every image referenced by a
    /// <c>src</c> attribute (gif, jpg/jpeg, png, bmp) and downloads each distinct
    /// image once.
    /// </summary>
    /// <param name="URL">Absolute address of the page to collect images from.</param>
    /// <param name="chargest">Name of the page's character encoding, e.g. "utf-8".</param>
    /// <param name="path">
    /// Prefix prepended to every generated file name. It is concatenated as-is, so
    /// pass a directory path that ends with a directory separator.
    /// </param>
    /// <returns>
    /// An HTML fragment with one entry per image: the image URL followed by either
    /// the path it was saved to or the error message of the failed download.
    /// </returns>
    public static string caijibyurl(string URL, string chargest, string path)
    {
        string html = getsourcetextbyurl(URL, chargest);
        Uri pageUri = new Uri(URL);

        // Parallel lists: imageUris[i] is saved with extensions[i].
        List<Uri> imageUris = new List<Uri>();
        List<string> extensions = new List<string>();
        foreach (Match match in ImageSrcPattern.Matches(html))
        {
            // Let Uri do the resolution: it handles absolute, root-relative,
            // document-relative and protocol-relative references and keeps the
            // port and the original letter case of the path.
            Uri imageUri;
            if (!Uri.TryCreate(pageUri, match.Groups["url"].Value, out imageUri))
            {
                continue;
            }

            if (!imageUris.Contains(imageUri))
            {
                imageUris.Add(imageUri);
                extensions.Add("." + match.Groups["ext"].Value.ToLowerInvariant());
            }
        }

        // Fixed-width timestamp including year and hour, so downloads made at the
        // same minute and second of different hours no longer overwrite each other.
        string stamp = DateTime.Now.ToString("yyyyMMddHHmmss", CultureInfo.InvariantCulture);
        StringBuilder report = new StringBuilder();

        using (WebClient client = new WebClient())
        {
            for (int j = 0; j < imageUris.Count; j++)
            {
                string imageUrl = imageUris[j].AbsoluteUri;
                string savePath = path + stamp + "_" + j + extensions[j];
                try
                {
                    client.DownloadFile(imageUris[j], savePath);
                    report.Append(imageUrl).Append("<br/> Save path:").Append(savePath).Append("<br/> ");
                }
                catch (Exception e)
                {
                    // Deliberately best-effort: one failed image must not abort the
                    // remaining downloads, so the failure is reported and skipped.
                    report.Append(imageUrl).Append(": ").Append(e.Message).Append("<br/> ");
                }
            }
        }

        return report.ToString();
    }

    /// <summary>
    /// Downloads the resource at <paramref name="URL"/> and returns its body as text.
    /// </summary>
    /// <param name="URL">Absolute address of the resource to read.</param>
    /// <param name="chargest">
    /// Name of the character encoding used to decode the body. Surrounding whitespace
    /// is ignored; UTF-8 is used when the value is null or blank.
    /// </param>
    /// <returns>The complete response body decoded with the requested encoding.</returns>
    public static string getsourcetextbyurl(string URL, string chargest)
    {
        string charsetName = chargest == null ? string.Empty : chargest.Trim();
        Encoding encoding = charsetName.Length == 0
            ? Encoding.UTF8
            : Encoding.GetEncoding(charsetName);

        WebRequest request = WebRequest.Create(URL);
        request.Timeout = 20000; // give up after 20 seconds

        // Dispose response, stream and reader so the connection is released promptly.
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, encoding))
        {
            return reader.ReadToEnd();
        }
    }
}
Usage: for example, to save the images to the Upload folder, the code is as follows:
string path = Server.MapPath("~/Upload/");
Response.Write(caiji.caijibyurl("http://www.jb51.net", "UTF-8", path));