Keywords: TIdHTTP, TRegExpr, regular expressions, links
Purpose: Extract the links and their link text from a web page.
Steps:
1. Use TIdHTTP to download the source code of the web page.
2. Use TRegExpr with a regular expression to extract each link and its link text from that source.
The implementation is as follows:
// Use a regular expression to match the source code of the webpage and extract the content.
{ Applies the regular expression Pattern to SourceHtmlTxt and appends one
  entry to AList for every match found.

  Parameters:
    SourceHtmlTxt - the text to scan (the HTML source of a page).
    Pattern       - a TRegExpr expression; it must define at least two
                    capture groups, because groups 1 and 2 are read below.
    AList         - an already-created string list that receives the results.

  Each appended entry has the form '<group 2>=<group 1>', i.e. the usual
  TStrings name=value layout with capture group 2 as the name and capture
  group 1 as the value. }
procedure GetLinkFromHtml(const SourceHtmlTxt, Pattern: string; var AList: TStringList);
var
  RegExp: TRegExpr;
  HasMatch: Boolean;
begin
  RegExp := TRegExpr.Create;
  try
    RegExp.Expression := Pattern;
    // Exec finds the first match; ExecNext continues from the end of the previous one.
    HasMatch := RegExp.Exec(SourceHtmlTxt);
    while HasMatch do
    begin
      // Match[0] would be the whole matched text; only the two captured parts are stored.
      AList.Add(RegExp.Match[2] + '=' + RegExp.Match[1]);
      HasMatch := RegExp.ExecNext;
    end;
  finally
    // Release the regex object even if Exec/ExecNext or Add raises.
    FreeAndNil(RegExp);
  end;
end;
// Parse the source code of the webpage and use a regular expression to read the required link.
{ Scans PageText for attachment links and appends them to AList by calling
  GetLinkFromHtml with a fixed pattern.

  Parameters:
    PageText - HTML source of the page to scan.
    AList    - an already-created string list that receives the results.

  The anchors being matched look like:
    <a href="http://ex01/public/abcdf.doc?attach=1" target="_blank"><font color="#000000">attachment 1.doc (37KB)</font></a>

  Capture groups of the pattern:
    1 - the href value up to (not including) '?attach=1'
    2 - the link text in front of the '(<digits>KB)' size suffix
    3 - the digits of the size

  NOTE(review): the pattern below was reconstructed from a copy in which the
  regex escapes appeared as '/S', '/D', '/?' and '/(' and extra spaces had been
  inserted between tokens; verify it against a real page before relying on it.
  Matching is case-sensitive unless the regex engine's case-insensitive
  modifier is enabled elsewhere - TODO confirm whether 'KB' and the tag names
  need to match other casings. }
procedure ParsePageContent(PageText: string; var AList: TStringList);
var
  APattern: string;
begin
  // '""{0}' is one literal double quote followed by "zero more double quotes",
  // so each attribute value is required to be enclosed in double quotes.
  APattern := '<a\s+href\s*=\s*""{0}([^>].*?)\?attach=1""{0}\s*target\s*=\s*""{0}_blank""{0}\s*>' +
              '\s*<font\s+color\s*=\s*""{0}#000000""{0}>(.*?)\(([\d]+)KB\)\s*</font>\s*</a>';
  GetLinkFromHtml(PageText, APattern, AList);
end;
// Read the webpage source code and obtain the URL and name of the file from the webpage.
// Return parameter: hreflist.
{ Downloads the page at EFileLink using HTTP basic authentication, decodes the
  response from UTF-8 and passes it to ParsePageContent, which appends the
  links it finds to HrefList.

  Parameters:
    EFileLink - URL of the page to download.
    Domain    - account domain, combined with UserName for the login name.
    UserName  - account name.
    Passwd    - account password.
    HrefList  - an already-created string list that receives the results
                (output parameter).

  Any exception raised by the HTTP request propagates to the caller; the HTTP
  component is freed in all cases. }
procedure GetLinkList(EFileLink, Domain, UserName, Passwd: string; var HrefList: TStringList);
var
  AIdHttp: TIdHTTP;
  WebHttpContent: string;
begin
  AIdHttp := TIdHTTP.Create(nil);
  try
    // Credentials are sent with HTTP basic authentication.
    AIdHttp.Request.BasicAuthentication := True;
    // NOTE(review): the separator is restored to the Windows 'DOMAIN\user' form;
    // the copy this was taken from showed '/', as it did for every other
    // backslash - confirm against the target server.
    AIdHttp.Request.Username := Domain + '\' + UserName;
    AIdHttp.Request.Password := Passwd;
    // Identify as a browser.
    AIdHttp.Request.UserAgent := 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)';
    AIdHttp.Request.Accept := 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, ' +
      'application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*';
    AIdHttp.Request.ContentType := 'text/xml; charset=''utf-8''';
    AIdHttp.Request.Connection := 'Keep-Alive';
    // No explicit Request.Method assignment: the request below is issued with
    // Get, so the original (garbled) method assignment is not needed.

    // Fetch the page source.
    // Example: EFileLink := 'http://ex01/public/workflow/...';
    WebHttpContent := AIdHttp.Get(EFileLink);
    // The response body is treated as UTF-8 and decoded before parsing.
    WebHttpContent := Utf8Decode(WebHttpContent);
    // Extract the links from the page source into HrefList.
    ParsePageContent(WebHttpContent, HrefList);
  finally
    FreeAndNil(AIdHttp);
  end;
end;
by jrq
2007.05.28 Yu Shi