When writing web-scraping (content collection) programs, the same three functions come up again and again. Together they form a small, general-purpose toolkit, based on regular expressions, for a collection system.
First: obtain the HTML source code of a page.
/// <summary>
/// Downloads the resource at <paramref name="URL"/> and returns its body decoded as text.
/// </summary>
/// <param name="URL">Absolute HTTP(S) address of the page to fetch.</param>
/// <param name="charset">
/// Name of the text encoding used to decode the response; null or empty selects "gb2312".
/// </param>
/// <returns>
/// The decoded response body, or an empty string when the request, the encoding lookup
/// or the read fails (the failure is reported through System.Diagnostics.Trace).
/// </returns>
public string gethtmlsource(string URL, string charset)
{
    if (string.IsNullOrEmpty(charset))
    {
        charset = "gb2312";
    }

    string text1 = "";
    try
    {
        HttpWebRequest request1 = (HttpWebRequest)WebRequest.Create(URL);

        // using-blocks release the connection even when decoding or reading throws.
        // NOTE(review): on .NET Core / .NET 5+ code-page encodings such as gb2312 are only
        // available after registering CodePagesEncodingProvider - confirm the target runtime.
        using (HttpWebResponse response1 = (HttpWebResponse)request1.GetResponse())
        using (Stream stream1 = response1.GetResponseStream())
        using (StreamReader reader1 = new StreamReader(stream1, Encoding.GetEncoding(charset)))
        {
            text1 = reader1.ReadToEnd();
        }
    }
    catch (Exception exception1)
    {
        // Best-effort by design: callers get "" on failure, but the reason is no longer lost.
        System.Diagnostics.Trace.TraceWarning("gethtmlsource failed for {0}: {1}", URL, exception1.Message);
    }
    return text1;
}
Second: extract the substring that lies between a start marker and an end marker.
/// <summary>
/// Extracts the text that lies between <paramref name="wordsbegin"/> and
/// <paramref name="wordsend"/> inside <paramref name="code"/>.
/// </summary>
/// <param name="code">Text to search (typically a page's HTML source).</param>
/// <param name="wordsbegin">
/// Regular-expression fragment that must precede the wanted text. It is inserted into the
/// pattern verbatim, so pass Regex.Escape(...) output when a literal match is intended.
/// </param>
/// <param name="wordsend">Regular-expression fragment that must follow the wanted text (also verbatim).</param>
/// <returns>
/// The shortest non-empty capture of the LAST match in <paramref name="code"/>, or an empty
/// string when nothing matches or <paramref name="code"/> is null or empty.
/// </returns>
public string sniffwebcode(string code, string wordsbegin, string wordsend)
{
    string newstitle = "";
    if (string.IsNullOrEmpty(code))
    {
        return newstitle;
    }

    // [\s\S]+? lazily matches any characters, including line breaks.
    // RegexOptions.Compiled is omitted: the pattern is rebuilt on every call, so compiling
    // it would cost more than it saves; the matches are identical either way.
    Regex regex1 = new Regex(wordsbegin + @"(?<title>[\s\S]+?)" + wordsend, RegexOptions.IgnoreCase);
    for (Match match1 = regex1.Match(code); match1.Success; match1 = match1.NextMatch())
    {
        // Each iteration overwrites the previous one, so the last match wins.
        newstitle = match1.Groups["title"].ToString();
    }
    return newstitle;
}
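Third: extract every substring that lies between the two markers (for example, all the URLs on a page) and return them as a list.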
/// <summary>
/// Collects every piece of text in <paramref name="code"/> that lies between
/// <paramref name="wordsbegin"/> and <paramref name="wordsend"/>.
/// </summary>
/// <param name="code">Text to search (typically a page's HTML source).</param>
/// <param name="wordsbegin">
/// Regular-expression fragment that must precede each wanted piece. It is inserted into the
/// pattern verbatim, so pass Regex.Escape(...) output when a literal match is intended.
/// </param>
/// <param name="wordsend">Regular-expression fragment that must follow each wanted piece (also verbatim).</param>
/// <returns>
/// An ArrayList of strings, one per match in document order; empty when nothing matches or
/// <paramref name="code"/> is null or empty.
/// </returns>
public ArrayList sniffwebcodereturnlist(string code, string wordsbegin, string wordsend)
{
    ArrayList urllist = new ArrayList();
    if (string.IsNullOrEmpty(code))
    {
        return urllist;
    }

    // [\s\S]+? lazily matches any characters, including line breaks.
    // RegexOptions.Compiled is omitted: the pattern is rebuilt on every call, so compiling
    // it would cost more than it saves; the matches are identical either way.
    Regex regex1 = new Regex(wordsbegin + @"(?<title>[\s\S]+?)" + wordsend, RegexOptions.IgnoreCase);
    for (Match match1 = regex1.Match(code); match1.Success; match1 = match1.NextMatch())
    {
        urllist.Add(match1.Groups["title"].ToString());
    }
    return urllist;
}
The complete code is as follows:
using System;
using System.Collections;
using System.Collections.Generic;
using System.Data;
using System.Data.OleDb;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
namespace getweb
{
    /// <summary>
    /// Helper methods for a web-collection (scraping) program: several ways of downloading a
    /// page as text, and regular-expression helpers that cut pieces out of the downloaded text.
    /// Also holds the OLE DB connection string of the program's database.
    /// </summary>
    public class dbconn
    {
        // Encoding assumed when the caller does not name one.
        private const string DefaultCharset = "gb2312";

        // SQL Server alternative:
        // public string dbconnstring = @"User ID=sa;Data Source=.;Password=sa;Initial Catalog=getweb;Provider=SQLOLEDB.1";

        /// <summary>OLE DB connection string for the Access database file getweb.mdb.</summary>
        public string dbconnstring = @"Provider=Microsoft.Jet.OLEDB.4.0;Data Source=getweb.mdb";

        /// <summary>
        /// Downloads <paramref name="URL"/> with a WebClient and returns the body decoded as text.
        /// </summary>
        /// <param name="URL">Address of the resource to fetch.</param>
        /// <param name="charset">Encoding name used for decoding; null or empty selects "gb2312".</param>
        /// <returns>
        /// The decoded body, or an empty string on any failure (the failure is reported
        /// through System.Diagnostics.Trace).
        /// </returns>
        public static string getsource(string URL, string charset)
        {
            if (string.IsNullOrEmpty(charset))
            {
                charset = DefaultCharset;
            }

            string text1 = "";
            try
            {
                // using-blocks release the client and the stream even when reading throws.
                using (WebClient client = new WebClient())
                using (Stream stream1 = client.OpenRead(URL))
                using (StreamReader reader1 = new StreamReader(stream1, Encoding.GetEncoding(charset)))
                {
                    text1 = reader1.ReadToEnd();
                }
            }
            catch (Exception exception1)
            {
                // Best-effort by design: callers get "" on failure, but the reason is no longer lost.
                System.Diagnostics.Trace.TraceWarning("getsource failed for {0}: {1}", URL, exception1.Message);
            }
            return text1;
        }

        /// <summary>
        /// Downloads <paramref name="URL"/> with an HttpWebRequest and returns the body decoded as text.
        /// </summary>
        /// <param name="URL">Absolute HTTP(S) address of the page to fetch.</param>
        /// <param name="charset">Encoding name used for decoding; null or empty selects "gb2312".</param>
        /// <returns>
        /// The decoded body, or an empty string on any failure (the failure is reported
        /// through System.Diagnostics.Trace).
        /// </returns>
        public string gethtmlsource(string URL, string charset)
        {
            if (string.IsNullOrEmpty(charset))
            {
                charset = DefaultCharset;
            }

            string text1 = "";
            try
            {
                HttpWebRequest request1 = (HttpWebRequest)WebRequest.Create(URL);

                // NOTE(review): on .NET Core / .NET 5+ code-page encodings such as gb2312 are only
                // available after registering CodePagesEncodingProvider - confirm the target runtime.
                using (HttpWebResponse response1 = (HttpWebResponse)request1.GetResponse())
                using (Stream stream1 = response1.GetResponseStream())
                using (StreamReader reader1 = new StreamReader(stream1, Encoding.GetEncoding(charset)))
                {
                    text1 = reader1.ReadToEnd();
                }
            }
            catch (Exception exception1)
            {
                // Best-effort by design: callers get "" on failure, but the reason is no longer lost.
                System.Diagnostics.Trace.TraceWarning("gethtmlsource failed for {0}: {1}", URL, exception1.Message);
            }
            return text1;
        }

        /// <summary>
        /// Downloads <paramref name="a_strurl"/> with a request timeout and returns the body,
        /// decoded with the system default encoding, with every line terminated by CR LF.
        /// </summary>
        /// <param name="a_strurl">Absolute HTTP(S) address of the page to fetch.</param>
        /// <param name="timeout">Request timeout in milliseconds (assigned to HttpWebRequest.Timeout).</param>
        /// <returns>
        /// The page text, or the string "error:" followed by the exception message on failure.
        /// </returns>
        public string get_http(string a_strurl, int timeout)
        {
            string strresult;
            try
            {
                HttpWebRequest myreq = (HttpWebRequest)WebRequest.Create(a_strurl);
                myreq.Timeout = timeout;

                using (HttpWebResponse httpwresp = (HttpWebResponse)myreq.GetResponse())
                using (Stream mystream = httpwresp.GetResponseStream())
                using (StreamReader sr = new StreamReader(mystream, Encoding.Default))
                {
                    StringBuilder strbuilder = new StringBuilder();

                    // Loop on ReadLine() rather than Peek(): on a network stream Peek() can
                    // return -1 while more data is still in flight, which truncates the page.
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        strbuilder.Append(line).Append("\r\n");
                    }
                    strresult = strbuilder.ToString();
                }
            }
            catch (Exception exp)
            {
                strresult = "error:" + exp.Message;
            }
            return strresult;
        }

        // After the page content has been downloaded, the helpers below analyse it to pull out
        // titles and the link addresses that should be crawled next.

        /// <summary>
        /// Builds the regex "wordsbegin + (shortest non-empty text) + wordsend", case-insensitive.
        /// </summary>
        private static Regex BuildDelimiterRegex(string wordsbegin, string wordsend)
        {
            // [\s\S]+? lazily matches any characters, including line breaks.
            // RegexOptions.Compiled is omitted: the pattern is rebuilt on every call, so
            // compiling it would cost more than it saves; the matches are identical either way.
            return new Regex(wordsbegin + @"(?<title>[\s\S]+?)" + wordsend, RegexOptions.IgnoreCase);
        }

        /// <summary>
        /// Extracts the text that lies between <paramref name="wordsbegin"/> and
        /// <paramref name="wordsend"/> inside <paramref name="code"/>.
        /// </summary>
        /// <param name="code">Text to search (typically a page's HTML source).</param>
        /// <param name="wordsbegin">
        /// Regular-expression fragment that must precede the wanted text. It is inserted into the
        /// pattern verbatim, so pass Regex.Escape(...) output when a literal match is intended.
        /// </param>
        /// <param name="wordsend">Regular-expression fragment that must follow the wanted text (also verbatim).</param>
        /// <returns>
        /// The capture of the LAST match in <paramref name="code"/>, or an empty string when
        /// nothing matches or <paramref name="code"/> is null or empty.
        /// </returns>
        public string sniffwebcode(string code, string wordsbegin, string wordsend)
        {
            string newstitle = "";
            if (string.IsNullOrEmpty(code))
            {
                return newstitle;
            }

            Regex regex1 = BuildDelimiterRegex(wordsbegin, wordsend);
            for (Match match1 = regex1.Match(code); match1.Success; match1 = match1.NextMatch())
            {
                // Each iteration overwrites the previous one, so the last match wins.
                newstitle = match1.Groups["title"].ToString();
            }
            return newstitle;
        }

        /// <summary>
        /// Collects every piece of text in <paramref name="code"/> that lies between
        /// <paramref name="wordsbegin"/> and <paramref name="wordsend"/>.
        /// </summary>
        /// <param name="code">Text to search (typically a page's HTML source).</param>
        /// <param name="wordsbegin">
        /// Regular-expression fragment that must precede each wanted piece. It is inserted into the
        /// pattern verbatim, so pass Regex.Escape(...) output when a literal match is intended.
        /// </param>
        /// <param name="wordsend">Regular-expression fragment that must follow each wanted piece (also verbatim).</param>
        /// <returns>
        /// An ArrayList of strings, one per match in document order; empty when nothing matches
        /// or <paramref name="code"/> is null or empty.
        /// </returns>
        public ArrayList sniffwebcodereturnlist(string code, string wordsbegin, string wordsend)
        {
            ArrayList urllist = new ArrayList();
            if (string.IsNullOrEmpty(code))
            {
                return urllist;
            }

            Regex regex1 = BuildDelimiterRegex(wordsbegin, wordsend);
            for (Match match1 = regex1.Match(code); match1.Success; match1 = match1.NextMatch())
            {
                urllist.Add(match1.Groups["title"].ToString());
            }
            return urllist;
        }
    }
}